From: Mike Bayer <mike_mp@zzzcomputing.com>
Date: Thu, 19 Mar 2020 16:32:47 +0000 (-0400)
Subject: Use looser tokenization for type comparison
X-Git-Tag: rel_1_4_2~1^2
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=52ef05c936170109f2f457751f4940e4269595c3;p=thirdparty%2Fsqlalchemy%2Falembic.git

Use looser tokenization for type comparison

Fixed more false-positive failures produced by the new "compare type" logic
first added in :ticket:`605`, particularly impacting MySQL string types
regarding flags such as "charset" and "collation".

Change-Id: If3dcfed7b0aa2d7367e1508289a0b5ee6f4c15a7
Fixes: #617
---

diff --git a/alembic/ddl/impl.py b/alembic/ddl/impl.py
index 655d8c94..9df2a72d 100644
--- a/alembic/ddl/impl.py
+++ b/alembic/ddl/impl.py
@@ -22,8 +22,7 @@ class ImplMeta(type):
 
 _impls = {}
 
-
-Params = namedtuple("Params", ["type", "args", "kwargs"])
+Params = namedtuple("Params", ["token0", "tokens", "args", "kwargs"])
 
 
 class DefaultImpl(with_metaclass(ImplMeta)):
@@ -45,6 +44,7 @@ class DefaultImpl(with_metaclass(ImplMeta)):
     transactional_ddl = False
     command_terminator = ";"
     type_synonyms = ({"NUMERIC", "DECIMAL"},)
+    type_arg_extract = ()
 
     def __init__(
         self,
@@ -339,33 +339,50 @@ class DefaultImpl(with_metaclass(ImplMeta)):
         # TIMESTAMP WITH TIMEZONE
         # INTEGER UNSIGNED
         # INTEGER (10) UNSIGNED
+        # INTEGER(10) UNSIGNED
+        # varchar character set utf8
         #
-        # "ext" are the words after the parenthesis, if any, but if there
-        # are no parenthesis, then these are part of "col".  so they are
-        # moved together for normalization purposes
-        matches = re.search(
-            r"^(?P<col>[^()]*)(?:\((?P<params>.*?)\))?(?P<ext>[^()]*)?$",
-            definition,
-        ).groupdict(default="")
-        col_type = matches["col"]
-        if matches["ext"]:
-            col_type = col_type.strip() + " " + matches["ext"].strip()
-
-        params = Params(col_type, [], {})
-        for term in re.findall("[^(),]+", matches["params"] or ""):
-            if "=" in term:
-                key, val = term.split("=")
-                params.kwargs[key.strip()] = val.strip()
+
+        tokens = re.findall(r"[\w\-_]+|\(.+?\)", definition)
+
+        term_tokens = []
+        paren_term = None
+
+        for token in tokens:
+            if re.match(r"^\(.*\)$", token):
+                paren_term = token
             else:
-                params.args.append(term.strip())
+                term_tokens.append(token)
+
+        params = Params(term_tokens[0], term_tokens[1:], [], {})
+
+        if paren_term:
+            for term in re.findall("[^(),]+", paren_term):
+                if "=" in term:
+                    key, val = term.split("=")
+                    params.kwargs[key.strip()] = val.strip()
+                else:
+                    params.args.append(term.strip())
+
         return params
 
-    def _column_types_match(self, inspector_type, metadata_type):
-        if inspector_type == metadata_type:
+    def _column_types_match(self, inspector_params, metadata_params):
+        if inspector_params.token0 == metadata_params.token0:
             return True
+
         synonyms = [{t.lower() for t in batch} for batch in self.type_synonyms]
+        inspector_all_terms = " ".join(
+            [inspector_params.token0] + inspector_params.tokens
+        )
+        metadata_all_terms = " ".join(
+            [metadata_params.token0] + metadata_params.tokens
+        )
+
         for batch in synonyms:
-            if {inspector_type, metadata_type}.issubset(batch):
+            if {inspector_all_terms, metadata_all_terms}.issubset(batch) or {
+                inspector_params.token0,
+                metadata_params.token0,
+            }.issubset(batch):
                 return True
         return False
 
@@ -375,22 +392,27 @@ class DefaultImpl(with_metaclass(ImplMeta)):
         we want to make sure they are the same. However, if only one
         specifies it, dont flag it for being less specific
         """
-        # py27- .keys is a set in py36
-        for kw in set(inspected_params.kwargs.keys()) & set(
-            meta_params.kwargs.keys()
+
+        if (
+            len(meta_params.tokens) == len(inspected_params.tokens)
+            and meta_params.tokens != inspected_params.tokens
         ):
-            if inspected_params.kwargs[kw] != meta_params.kwargs[kw]:
-                return False
+            return False
+
+        if (
+            len(meta_params.args) == len(inspected_params.args)
+            and meta_params.args != inspected_params.args
+        ):
+            return False
+
+        insp = " ".join(inspected_params.tokens).lower()
+        meta = " ".join(meta_params.tokens).lower()
+
+        for reg in self.type_arg_extract:
+            mi = re.search(reg, insp)
+            mm = re.search(reg, meta)
 
-        # compare the positional arguments to the extent that they are
-        # present in the SQL form of the metadata type compared to
-        # the inspected type.   Example:  Oracle NUMBER(17) and NUMBER(17, 0)
-        # are equivalent.  As it's not clear if some database types might
-        # ignore additional parameters vs. they aren't actually there and
-        # need to be added, for now we are still favoring avoiding false
-        # positives
-        for meta, inspect in zip(meta_params.args, inspected_params.args):
-            if meta != inspect:
+            if mi and mm and mi.group(1) != mm.group(1):
                 return False
 
         return True
@@ -402,9 +424,8 @@ class DefaultImpl(with_metaclass(ImplMeta)):
         """
         inspector_params = self._tokenize_column_type(inspector_column)
         metadata_params = self._tokenize_column_type(metadata_column)
-        if not self._column_types_match(
-            inspector_params.type, metadata_params.type
-        ):
+
+        if not self._column_types_match(inspector_params, metadata_params,):
             return True
         if not self._column_args_match(inspector_params, metadata_params):
             return True
diff --git a/alembic/ddl/mssql.py b/alembic/ddl/mssql.py
index 3ccbbff0..78a7eb65 100644
--- a/alembic/ddl/mssql.py
+++ b/alembic/ddl/mssql.py
@@ -26,6 +26,8 @@ class MSSQLImpl(DefaultImpl):
     transactional_ddl = True
     batch_separator = "GO"
 
+    type_synonyms = DefaultImpl.type_synonyms + ({"VARCHAR", "NVARCHAR"},)
+
     def __init__(self, *arg, **kw):
         super(MSSQLImpl, self).__init__(*arg, **kw)
         self.batch_separator = self.context_opts.get(
diff --git a/alembic/ddl/mysql.py b/alembic/ddl/mysql.py
index a66bb84f..02ce2536 100644
--- a/alembic/ddl/mysql.py
+++ b/alembic/ddl/mysql.py
@@ -25,6 +25,7 @@ class MySQLImpl(DefaultImpl):
 
     transactional_ddl = False
     type_synonyms = DefaultImpl.type_synonyms + ({"BOOL", "TINYINT"},)
+    type_arg_extract = [r"character set ([\w\-_]+)", r"collate ([\w\-_]+)"]
 
     def alter_column(
         self,
diff --git a/docs/build/unreleased/671.rst b/docs/build/unreleased/671.rst
new file mode 100644
index 00000000..635faaee
--- /dev/null
+++ b/docs/build/unreleased/671.rst
@@ -0,0 +1,7 @@
+.. change::
+    :tags: bug, autogenerate, mysql
+    :tickets: 617
+
+    Fixed more false-positive failures produced by the new "compare type" logic
+    first added in :ticket:`605`, particularly impacting MySQL string types
+    regarding flags such as "charset" and "collation".
diff --git a/tests/requirements.py b/tests/requirements.py
index bd258e08..d6794a54 100644
--- a/tests/requirements.py
+++ b/tests/requirements.py
@@ -98,6 +98,22 @@ class DefaultRequirements(SuiteRequirements):
 
         return exclusions.only_on(["postgresql"])
 
+    @property
+    def postgresql(self):
+        return exclusions.only_on(["postgresql"])
+
+    @property
+    def mysql(self):
+        return exclusions.only_on(["mysql"])
+
+    @property
+    def oracle(self):
+        return exclusions.only_on(["oracle"])
+
+    @property
+    def mssql(self):
+        return exclusions.only_on(["mssql"])
+
     @property
     def postgresql_uuid_ossp(self):
         def check_uuid_ossp(config):
diff --git a/tests/test_autogen_diffs.py b/tests/test_autogen_diffs.py
index e1e5c8d4..ad387655 100644
--- a/tests/test_autogen_diffs.py
+++ b/tests/test_autogen_diffs.py
@@ -786,6 +786,8 @@ class CompareMetadataToInspectorTest(TestBase):
         # that have not changed.
         is_(self._compare_columns(cola, cola), False)
 
+    # TODO: ideally the backend-specific types would be tested
+    # within the test suites for those backends.
     @testing.combinations(
         (String(32), VARCHAR(32), False),
         (VARCHAR(6), String(6), False),
@@ -795,6 +797,40 @@ class CompareMetadataToInspectorTest(TestBase):
         (Unicode(32), VARCHAR(32), False, config.requirements.unicode_string),
         (VARCHAR(6), VARCHAR(12), True),
         (VARCHAR(6), String(12), True),
+        (Integer(), String(10), True),
+        (String(10), Integer(), True),
+        (
+            Unicode(30, collation="en_US"),
+            Unicode(30, collation="en_US"),
+            False,  # unfortunately dialects don't seem to consistently
+            # reflect collations right now so we can't test for
+            # positives here
+            config.requirements.postgresql,
+        ),
+        (
+            mysql.VARCHAR(200, charset="utf8"),
+            Unicode(200),
+            False,
+            config.requirements.mysql,
+        ),
+        (
+            mysql.VARCHAR(200, charset="latin1"),
+            mysql.VARCHAR(200, charset="utf-8"),
+            True,
+            config.requirements.mysql + config.requirements.sqlalchemy_13,
+        ),
+        (
+            String(255, collation="utf8_bin"),
+            String(255),
+            False,
+            config.requirements.mysql,
+        ),
+        (
+            String(255, collation="utf8_bin"),
+            String(255, collation="latin1_bin"),
+            True,
+            config.requirements.mysql + config.requirements.sqlalchemy_13,
+        ),
     )
     def test_string_comparisons(self, cola, colb, expect_changes):
         is_(self._compare_columns(cola, colb), expect_changes)