git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Amend recent fix for SIMILAR TO regex conversion.
author Tom Lane <tgl@sss.pgh.pa.us>
Sat, 13 Sep 2025 20:55:51 +0000 (16:55 -0400)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sat, 13 Sep 2025 20:55:51 +0000 (16:55 -0400)
Commit e3ffc3e91 fixed the translation of character classes in
SIMILAR TO regular expressions.  Unfortunately the fix broke a corner
case: if there is an escape character right after the opening bracket
(for example in "[\q]"), a closing bracket right after the escape
sequence would not be seen as closing the character class.

There were two more oversights: a backslash or a nested opening bracket
right at the beginning of a character class should remove the special
meaning from any following caret or closing bracket.

This bug suggests that this code needs to be more readable, so also
rename the variables "charclass_depth" and "charclass_start" to
something more meaningful, rewrite an "if" cascade to be more
consistent, and improve the commentary.

Reported-by: Dominique Devienne <ddevienne@gmail.com>
Reported-by: Stephan Springl <springl-psql@bfw-online.de>
Author: Laurenz Albe <laurenz.albe@cybertec.at>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Discussion: https://postgr.es/m/CAFCRh-8NwJd0jq6P=R3qhHyqU7hw0BTor3W0SvUcii24et+zAw@mail.gmail.com
Backpatch-through: 13

src/backend/utils/adt/regexp.c
src/test/regress/expected/strings.out
src/test/regress/sql/strings.sql

index 24058a4bc93790c16f475eb6987dabe0fe7d8a67..4d2ea4848fbdc979d04bc27b3ee2cdb3446385b0 100644 (file)
@@ -674,10 +674,8 @@ similar_escape_internal(text *pat_text, text *esc_text)
                                elen;
        bool            afterescape = false;
        int                     nquotes = 0;
-       int                     charclass_depth = 0;    /* Nesting level of character classes,
-                                                                                * encompassed by square brackets */
-       int                     charclass_start = 0;    /* State of the character class start,
-                                                                                * for carets */
+       int                     bracket_depth = 0;      /* square bracket nesting level */
+       int                     charclass_pos = 0;      /* position inside a character class */
 
        p = VARDATA_ANY(pat_text);
        plen = VARSIZE_ANY_EXHDR(pat_text);
@@ -736,6 +734,17 @@ similar_escape_internal(text *pat_text, text *esc_text)
         * the relevant part separators in the above expansion.  If the result
         * of this function is used in a plain regexp match (SIMILAR TO), the
         * escape-double-quotes have no effect on the match behavior.
+        *
+        * While we don't fully validate character classes (bracket expressions),
+        * we do need to parse them well enough to know where they end.
+        * "charclass_pos" tracks where we are in a character class.
+        * Its value is uninteresting when bracket_depth is 0.
+        * But when bracket_depth > 0, it will be
+        *   1: right after the opening '[' (a following '^' will negate
+        *      the class, while ']' is a literal character)
+        *   2: right after a '^' after the opening '[' (']' is still a literal
+        *      character)
+        *   3 or more: further inside the character class (']' ends the class)
         *----------
         */
 
@@ -807,7 +816,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
                /* fast path */
                if (afterescape)
                {
-                       if (pchar == '"' && charclass_depth < 1)        /* escape-double-quote? */
+                       if (pchar == '"' && bracket_depth < 1)  /* escape-double-quote? */
                        {
                                /* emit appropriate part separator, per notes above */
                                if (nquotes == 0)
@@ -848,6 +857,12 @@ similar_escape_internal(text *pat_text, text *esc_text)
                                 */
                                *r++ = '\\';
                                *r++ = pchar;
+
+                               /*
+                                * If we encounter an escaped character in a character class,
+                                * we are no longer at the beginning.
+                                */
+                               charclass_pos = 3;
                        }
                        afterescape = false;
                }
@@ -856,41 +871,69 @@ similar_escape_internal(text *pat_text, text *esc_text)
                        /* SQL escape character; do not send to output */
                        afterescape = true;
                }
-               else if (charclass_depth > 0)
+               else if (bracket_depth > 0)
                {
+                       /* inside a character class */
                        if (pchar == '\\')
+                       {
+                               /*
+                                * If we're here, backslash is not the SQL escape character,
+                                * so treat it as a literal class element, which requires
+                                * doubling it.  (This matches our behavior for backslashes
+                                * outside character classes.)
+                                */
                                *r++ = '\\';
+                       }
                        *r++ = pchar;
 
-                       /*
-                        * Ignore a closing bracket at the start of a character class.
-                        * Such a bracket is taken literally rather than closing the
-                        * class.  "charclass_start" is 1 right at the beginning of a
-                        * class and 2 after an initial caret.
-                        */
-                       if (pchar == ']' && charclass_start > 2)
-                               charclass_depth--;
+                       /* parse the character class well enough to identify ending ']' */
+                       if (pchar == ']' && charclass_pos > 2)
+                       {
+                               /* found the real end of a bracket pair */
+                               bracket_depth--;
+                               /* don't reset charclass_pos, this may be an inner bracket */
+                       }
                        else if (pchar == '[')
-                               charclass_depth++;
+                       {
+                               /* start of a nested bracket pair */
+                               bracket_depth++;
 
-                       /*
-                        * If there is a caret right after the opening bracket, it negates
-                        * the character class, but a following closing bracket should
-                        * still be treated as a normal character.  That holds only for
-                        * the first caret, so only the values 1 and 2 mean that closing
-                        * brackets should be taken literally.
-                        */
-                       if (pchar == '^')
-                               charclass_start++;
+                               /*
+                                * We are no longer at the beginning of a character class.
+                                * (The nested bracket pair is a collating element, not a
+                                * character class in its own right.)
+                                */
+                               charclass_pos = 3;
+                       }
+                       else if (pchar == '^')
+                       {
+                               /*
+                                * A caret right after the opening bracket negates the
+                                * character class.  In that case, the following will
+                                * increment charclass_pos from 1 to 2, so that a following
+                                * ']' is still a literal character and does not end the
+                                * character class.  If we are further inside a character
+                                * class, charclass_pos might get incremented past 3, which is
+                                * fine.
+                                */
+                               charclass_pos++;
+                       }
                        else
-                               charclass_start = 3;    /* definitely past the start */
+                       {
+                               /*
+                                * Anything else (including a backslash or leading ']') is an
+                                * element of the character class, so we are no longer at the
+                                * beginning of the class.
+                                */
+                               charclass_pos = 3;
+                       }
                }
                else if (pchar == '[')
                {
                        /* start of a character class */
                        *r++ = pchar;
-                       charclass_depth++;
-                       charclass_start = 1;
+                       bracket_depth = 1;
+                       charclass_pos = 1;
                }
                else if (pchar == '%')
                {
index 43e351799381236e00df5d9f35b4ecc7605f514e..4ed627826b4bc398e79a539d592e2811126308e8 100644 (file)
@@ -631,6 +631,15 @@ EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
    Filter: (f1 ~ '^(?:[^^]\^)$'::text)
 (2 rows)
 
+-- Closing square bracket after an escape sequence at the beginning of
+-- a character class closes the character class
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[|a]%' ESCAPE '|';
+              QUERY PLAN               
+---------------------------------------
+ Seq Scan on text_tbl
+   Filter: (f1 ~ '^(?:[\a].*)$'::text)
+(2 rows)
+
 -- Test back reference in regexp_replace
 SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
  regexp_replace 
index 481166276573448197c08ef28eccf149f57e9e0d..0fe6d05be5fd7c2987f2b66cb68ff135b2203c59 100644 (file)
@@ -204,6 +204,9 @@ EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
 -- Closing square bracket effective after two carets at the beginning
 -- of character class.
 EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
+-- Closing square bracket after an escape sequence at the beginning of
+-- a character class closes the character class
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[|a]%' ESCAPE '|';
 
 -- Test back reference in regexp_replace
 SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');