MEDIUM: samples: add a regsub converter to perform regex-based transformations

author Willy Tarreau <w@1wt.eu>

Tue, 20 Jan 2015 18:47:06 +0000 (19:47 +0100)

committer Willy Tarreau <w@1wt.eu>

Thu, 22 Jan 2015 13:24:53 +0000 (14:24 +0100)
author Willy Tarreau <w@1wt.eu>
Tue, 20 Jan 2015 18:47:06 +0000 (19:47 +0100)
committer Willy Tarreau <w@1wt.eu>
Thu, 22 Jan 2015 13:24:53 +0000 (14:24 +0100)
diff --git a/doc/configuration.txt b/doc/configuration.txt

index e899297cd01bdb0aebf6d1ed910f03d4c3e0154e..380fb836b7269fc25bf1c784bafa979a2fedbbcb 100644 (file)
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -10328,6 +10328,28 @@ map_<match_type>_<output_type>(<map_file>[,<default_value>])
        |       `---------------------------- key
        `------------------------------------ leading spaces ignored
  
+regsub(<regex>,<subst>[,<flags>])
+  Applies a regex-based substitution to the input string. It does the same
+  operation as the well-known "sed" utility with "s/<regex>/<subst>/". By
+  default it will replace in the input string the first occurrence of the
+  largest part matching the regular expression <regex> with the substitution
+  string <subst>. It is possible to replace all occurrences instead by adding
+  the flag "g" in the third argument <flags>. It is also possible to make the
+  regex case insensitive by adding the flag "i" in <flags>. Since <flags> is a
+  string, it is made up from the concatenation of all desired flags. Thus if
+  both "i" and "g" are desired, using "gi" or "ig" will have the same effect.
+  It is important to note that due to the current limitations of the
+  configuration parser, some characters such as the closing parenthesis or the
+  comma cannot be used in the arguments. The first use of this converter is
+  to replace certain characters or sequences of characters with other ones.
+
+  Example :
+
+     # de-duplicate "/" in header "x-path".
+     # input:  x-path: /////a///b/c/xzxyz/
+     # output: x-path: /a/b/c/xzxyz/
+     http-request set-header x-path %[hdr(x-path),regsub(/+,/,g)]
+
  sdbm([<avalanche>])
    Hashes a binary input sample into an unsigned 32-bit quantity using the SDBM
    hash function. Optionally, it is possible to apply a full avalanche hash
diff --git a/src/sample.c b/src/sample.c

index 8fe341f3990df75b818b00e56661db290f6b227b..598bb533e178b0057f6547620aa6917d598fd27e 100644 (file)
--- a/src/sample.c
+++ b/src/sample.c
@@ -1782,6 +1782,112 @@ found:
         return 1;
  }
  
+/* Argument checker for the "regsub" converter. It parses the optional flags
+ * string found in the third argument (arg[2]) and stores the result into the
+ * type_flags field of the first argument (arg[0], the regex): each 'i' sets
+ * ARGF_REG_ICASE and each 'g' sets ARGF_REG_GLOB. Flags may appear in any
+ * order and may be repeated. <conv>, <file> and <line> are not used here.
+ *
+ * Returns 1 on success (including when arg[2] is not a string, in which case
+ * no flag is set), or 0 after filling <err> with a message when a character
+ * other than 'i' or 'g' is found in the flags string.
+ */
+static int sample_conv_regsub_check(struct arg *args, struct sample_conv *conv,
+                                    const char *file, int line, char **err)
+{
+       struct arg *arg = args;
+       char *p;
+       int len;
+
+       /* arg0 is a regex, it uses type_flag for ICASE and global match */
+       arg[0].type_flags = 0;
+
+       /* third argument is not a string (presumably because <flags> was
+        * omitted in the configuration): nothing to parse, keep flags cleared.
+        */
+       if (arg[2].type != ARGT_STR)
+               return 1;
+
+       /* walk over the flags using the explicit length, without relying on
+        * a trailing zero.
+        */
+       p = arg[2].data.str.str;
+       len = arg[2].data.str.len;
+       while (len) {
+               if (*p == 'i') {
+                       arg[0].type_flags |= ARGF_REG_ICASE;
+               }
+               else if (*p == 'g') {
+                       arg[0].type_flags |= ARGF_REG_GLOB;
+               }
+               else {
+                       /* unknown flag: report the offending character and fail */
+                       memprintf(err, "invalid regex flag '%c', only 'i' and 'g' are supported", *p);
+                       return 0;
+               }
+               p++;
+               len--;
+       }
+       return 1;
+}
+
+/* This sample function is designed to do the equivalent of s/match/replace/ on
+ * the input string. First arg (arg_p[0]) is the regex to apply to the input
+ * string, second arg (arg_p[1]) is the replacement string, which is copied
+ * as-is in place of each match (only pmatch[0] is consulted, sub-matches are
+ * not used). When ARGF_REG_GLOB is set in arg_p[0].type_flags, the regex is
+ * applied again from the end of the last match until nothing matches anymore;
+ * otherwise only the first match is replaced and the rest of the input is
+ * copied unchanged.
+ *
+ * The result is built into a trash chunk and never exceeds trash->size: any
+ * output which does not fit is silently dropped. On return, smp->data.str is
+ * overwritten with this chunk. The function always returns 1.
+ */
+static int sample_conv_regsub(const struct arg *arg_p, struct sample *smp)
+{
+       char *start, *end;
+       struct my_regex *reg = arg_p[0].data.reg;
+       regmatch_t pmatch[MAX_MATCH];
+       /* output buffer; the code appends at trash->len, so this presumably
+        * relies on get_trash_chunk() returning an empty chunk -- TODO confirm.
+        */
+       struct chunk *trash = get_trash_chunk();
+       int flag, max;
+       int found;
+
+       /* <start> is the beginning of the not-yet-processed part of the input,
+        * <end> points just past its last character.
+        */
+       start = smp->data.str.str;
+       end = start + smp->data.str.len;
+
+       /* <flag> is passed to the regex execution. REG_NOTBOL is added after
+        * the first round, so it also tells whether a first match was already
+        * processed.
+        */
+       flag = 0;
+       while (1) {
+               /* check for last round which is used to copy remaining parts
+                * when not running in global replacement mode.
+                */
+               found = 0;
+               if ((arg_p[0].type_flags & ARGF_REG_GLOB) || !(flag & REG_NOTBOL)) {
+                       /* Note: we can have start == end on empty strings or at the end */
+                       found = regex_exec_match2(reg, start, end - start, MAX_MATCH, pmatch, flag);
+               }
+
+               /* no match: pretend the match starts at the end of the input so
+                * that the copy below emits everything that remains.
+                */
+               if (!found)
+                       pmatch[0].rm_so = end - start;
+
+               /* copy the heading non-matching part (which may also be the tail if nothing matches) */
+               max = trash->size - trash->len;
+               if (max && pmatch[0].rm_so > 0) {
+                       /* clamp to the room left in the output buffer */
+                       if (max > pmatch[0].rm_so)
+                               max = pmatch[0].rm_so;
+                       memcpy(trash->str + trash->len, start, max);
+                       trash->len += max;
+               }
+
+               if (!found)
+                       break;
+
+               /* replace the matching part */
+               max = trash->size - trash->len;
+               if (max) {
+                       /* clamp to the room left in the output buffer */
+                       if (max > arg_p[1].data.str.len)
+                               max = arg_p[1].data.str.len;
+                       memcpy(trash->str + trash->len, arg_p[1].data.str.str, max);
+                       trash->len += max;
+               }
+
+               /* stop here if we're done with this string */
+               if (start >= end)
+                       break;
+
+               /* We have a special case for matches of length 0 (eg: "x*y*").
+                * These ones are considered to match in front of a character,
+                * so we have to copy that character and skip to the next one.
+                *
+                * NOTE(review): the test below is on the end offset being zero,
+                * so it only catches empty matches located at offset 0 of
+                * <start>. An empty match reported at a later offset is not
+                * handled here -- confirm this is intended.
+                */
+               if (!pmatch[0].rm_eo) {
+                       if (trash->len < trash->size)
+                               trash->str[trash->len++] = start[pmatch[0].rm_eo];
+                       pmatch[0].rm_eo++;
+               }
+
+               /* resume right after the match; from now on we're not at the
+                * beginning of the line anymore.
+                */
+               start += pmatch[0].rm_eo;
+               flag |= REG_NOTBOL;
+       }
+
+       /* hand the rewritten string back through the sample */
+       smp->data.str = *trash;
+       return 1;
+}
+
  /************************************************************************/
  /*       All supported sample fetch functions must be declared here     */
  /************************************************************************/
@@ -1927,6 +2033,7 @@ static struct sample_conv_kw_list sample_conv_kws = {ILH, {
         { "bytes",  sample_conv_bytes,     ARG2(1,UINT,UINT), NULL, SMP_T_BIN,  SMP_T_BIN },
         { "field",  sample_conv_field,     ARG2(2,UINT,STR), sample_conv_field_check, SMP_T_STR,  SMP_T_STR },
         { "word",   sample_conv_word,      ARG2(2,UINT,STR), sample_conv_field_check, SMP_T_STR,  SMP_T_STR },
+       { "regsub", sample_conv_regsub,    ARG3(2,REG,STR,STR), sample_conv_regsub_check, SMP_T_STR, SMP_T_STR },
         { NULL, NULL, 0, 0, 0 },
  }};
author	Willy Tarreau <w@1wt.eu>
	Tue, 20 Jan 2015 18:47:06 +0000 (19:47 +0100)
committer	Willy Tarreau <w@1wt.eu>
	Thu, 22 Jan 2015 13:24:53 +0000 (14:24 +0100)
doc/configuration.txt		patch \| blob \| blame \| history
src/sample.c		patch \| blob \| blame \| history