git.ipfire.org Git - thirdparty/postgresql.git/commitdiff
Fix conversion of SIMILAR TO regexes for character classes
author: Michael Paquier <michael@paquier.xyz>
Tue, 27 May 2025 23:59:27 +0000 (08:59 +0900)
committer: Michael Paquier <michael@paquier.xyz>
Tue, 27 May 2025 23:59:27 +0000 (08:59 +0900)
The code that translates SIMILAR TO pattern matching expressions to
POSIX-style regular expressions did not consider that square brackets
can be nested.  For example, in an expression like [[:alpha:]%_], the
logic replaced the placeholders '_' and '%' but it should not.

This commit fixes the conversion logic by tracking the nesting level of
square brackets marking character class areas, while considering that
in expressions like []] or [^]] the first closing square bracket is a
regular character.  Multiple tests are added to show how the conversions
should or should not be applied while in a character class area, with
specific cases added for all the characters converted outside character
classes like an opening parenthesis '(', dollar sign '$', etc.

Author: Laurenz Albe <laurenz.albe@cybertec.at>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Reviewed-by: Michael Paquier <michael@paquier.xyz>
Discussion: https://postgr.es/m/16ab039d1af455652bdf4173402ddda145f2c73b.camel@cybertec.at
Backpatch-through: 13

src/backend/utils/adt/regexp.c
src/test/regress/expected/strings.out
src/test/regress/sql/strings.sql

index a32c5c82ab4377f7d68de93e0c6759a137c9af3f..24058a4bc93790c16f475eb6987dabe0fe7d8a67 100644 (file)
@@ -673,8 +673,11 @@ similar_escape_internal(text *pat_text, text *esc_text)
        int                     plen,
                                elen;
        bool            afterescape = false;
-       bool            incharclass = false;
        int                     nquotes = 0;
+       int                     charclass_depth = 0;    /* Nesting level of character classes,
+                                                                                * encompassed by square brackets */
+       int                     charclass_start = 0;    /* State of the character class start,
+                                                                                * for carets */
 
        p = VARDATA_ANY(pat_text);
        plen = VARSIZE_ANY_EXHDR(pat_text);
@@ -804,7 +807,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
                /* fast path */
                if (afterescape)
                {
-                       if (pchar == '"' && !incharclass)       /* escape-double-quote? */
+                       if (pchar == '"' && charclass_depth < 1)        /* escape-double-quote? */
                        {
                                /* emit appropriate part separator, per notes above */
                                if (nquotes == 0)
@@ -853,18 +856,41 @@ similar_escape_internal(text *pat_text, text *esc_text)
                        /* SQL escape character; do not send to output */
                        afterescape = true;
                }
-               else if (incharclass)
+               else if (charclass_depth > 0)
                {
                        if (pchar == '\\')
                                *r++ = '\\';
                        *r++ = pchar;
-                       if (pchar == ']')
-                               incharclass = false;
+
+                       /*
+                        * Ignore a closing bracket at the start of a character class.
+                        * Such a bracket is taken literally rather than closing the
+                        * class.  "charclass_start" is 1 right at the beginning of a
+                        * class and 2 after an initial caret.
+                        */
+                       if (pchar == ']' && charclass_start > 2)
+                               charclass_depth--;
+                       else if (pchar == '[')
+                               charclass_depth++;
+
+                       /*
+                        * If there is a caret right after the opening bracket, it negates
+                        * the character class, but a following closing bracket should
+                        * still be treated as a normal character.  That holds only for
+                        * the first caret, so only the values 1 and 2 mean that closing
+                        * brackets should be taken literally.
+                        */
+                       if (pchar == '^')
+                               charclass_start++;
+                       else
+                               charclass_start = 3;    /* definitely past the start */
                }
                else if (pchar == '[')
                {
+                       /* start of a character class */
                        *r++ = pchar;
-                       incharclass = true;
+                       charclass_depth++;
+                       charclass_start = 1;
                }
                else if (pchar == '%')
                {
index 99b0eb3f688dda352451bce29c198507fee3b44c..8d4ba17628faa55efa9a6bb7930b58e82669561b 100644 (file)
@@ -564,6 +564,68 @@ SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
 SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
 ERROR:  invalid escape string
 HINT:  Escape string must be empty or one character.
+-- Characters that should be left alone in character classes when a
+-- SIMILAR TO regexp pattern is converted to POSIX style.
+-- Underscore "_"
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
+                   QUERY PLAN                   
+------------------------------------------------
+ Seq Scan on text_tbl
+   Filter: (f1 ~ '^(?:.[_[:alpha:]_].)$'::text)
+(2 rows)
+
+-- Percentage "%"
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
+                    QUERY PLAN                    
+--------------------------------------------------
+ Seq Scan on text_tbl
+   Filter: (f1 ~ '^(?:.*[%[:alnum:]%].*)$'::text)
+(2 rows)
+
+-- Dot "."
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
+                    QUERY PLAN                    
+--------------------------------------------------
+ Seq Scan on text_tbl
+   Filter: (f1 ~ '^(?:\.[.[:alnum:].]\.)$'::text)
+(2 rows)
+
+-- Dollar "$"
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
+                    QUERY PLAN                    
+--------------------------------------------------
+ Seq Scan on text_tbl
+   Filter: (f1 ~ '^(?:\$[$[:alnum:]$]\$)$'::text)
+(2 rows)
+
+-- Opening parenthesis "("
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
+ERROR:  invalid regular expression: parentheses () not balanced
+-- Caret "^"
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
+                               QUERY PLAN                               
+------------------------------------------------------------------------
+ Seq Scan on text_tbl
+   Filter: (f1 ~ '^(?:\^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]\^)$'::text)
+(2 rows)
+
+-- Closing square bracket "]" at the beginning of character class
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
+                   QUERY PLAN                   
+------------------------------------------------
+ Seq Scan on text_tbl
+   Filter: (f1 ~ '^(?:[]%][^]%][^%].*)$'::text)
+(2 rows)
+
+-- Closing square bracket effective after two carets at the beginning
+-- of character class.
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
+              QUERY PLAN               
+---------------------------------------
+ Seq Scan on text_tbl
+   Filter: (f1 ~ '^(?:[^^]\^)$'::text)
+(2 rows)
+
 -- Test back reference in regexp_replace
 SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
  regexp_replace 
index 63ddce67c54b7afb472d9ae15685e3051a9dd9b1..190d6c110f6c56bf97b0f8b40bc60546fdbc5ed7 100644 (file)
@@ -185,6 +185,26 @@ SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true;
 SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
 SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
 
+-- Characters that should be left alone in character classes when a
+-- SIMILAR TO regexp pattern is converted to POSIX style.
+-- Underscore "_"
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
+-- Percentage "%"
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
+-- Dot "."
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
+-- Dollar "$"
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
+-- Opening parenthesis "("
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
+-- Caret "^"
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
+-- Closing square bracket "]" at the beginning of character class
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
+-- Closing square bracket effective after two carets at the beginning
+-- of character class.
+EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
+
 -- Test back reference in regexp_replace
 SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
 SELECT regexp_replace('AAA   BBB   CCC   ', E'\\s+', ' ', 'g');