From: Mike Bayer <mike_mp@zzzcomputing.com>
Date: Sat, 5 Oct 2019 22:27:44 +0000 (-0400)
Subject: create second level deduping when use_labels is turned on
X-Git-Tag: rel_1_4_0b1~695^2
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e0396633e72bc09bd7cec715101d516ea87fa840;p=thirdparty%2Fsqlalchemy%2Fsqlalchemy.git

create second level deduping when use_labels is turned on

As of #4753 we allow duplicate columns.  This creates some new
problems that there can be duplicate columns in a subquery
which are then not addressible on the outside because they
are ambiguous (Postgresql has this behavior at least).  Additionally
it creates situations where we are making an anon label of an
anon label which is leaking into the query.

New logic for generating anon labels handles this situation and
also alters the .c collection
of a subquery such that we are only getting the first column
from the derived selectable that has that name, the subsequent ones
have a new deduping label with two underscores and are not exposed
in .c.  The dedupe logic when rendering the columns will handle
duplicate label names for different columns, vs. the same column
repeated, as separate cases.

Fixes: #4892
Change-Id: I929fbd8da14bcc239e0481c24bbd9b5ce826e8fa
---

diff --git a/lib/sqlalchemy/sql/elements.py b/lib/sqlalchemy/sql/elements.py
index 8ee157b6f8..b6462b3347 100644
--- a/lib/sqlalchemy/sql/elements.py
+++ b/lib/sqlalchemy/sql/elements.py
@@ -878,7 +878,13 @@ class ColumnElement(
         while self._is_clone_of is not None:
             self = self._is_clone_of
 
-        return _anonymous_label("%%(%d %s)s" % (id(self), seed or "anon"))
+        # as of 1.4 anonymous label for ColumnElement uses hash(), not id(),
+        # as the identifier, because a column and its annotated version are
+        # the same thing in a SQL statement
+        if isinstance(seed, _anonymous_label):
+            return _anonymous_label("%s%%(%d %s)s" % (seed, hash(self), ""))
+
+        return _anonymous_label("%%(%d %s)s" % (hash(self), seed or "anon"))
 
     @util.memoized_property
     def anon_label(self):
@@ -900,6 +906,10 @@ class ColumnElement(
     def _label_anon_label(self):
         return self._anon_label(getattr(self, "_label", None))
 
+    @util.memoized_property
+    def _dedupe_label_anon_label(self):
+        return self._anon_label(getattr(self, "_label", "anon") + "_")
+
 
 class WrapsColumnExpression(object):
     """Mixin that defines a :class:`.ColumnElement` as a wrapper with special
diff --git a/lib/sqlalchemy/sql/selectable.py b/lib/sqlalchemy/sql/selectable.py
index 33ba95717c..ddbcdf91d6 100644
--- a/lib/sqlalchemy/sql/selectable.py
+++ b/lib/sqlalchemy/sql/selectable.py
@@ -4174,33 +4174,78 @@ class Select(
             (name_for_col(c), c) for c in cols
         ).as_immutable()
 
-    @_memoized_property
-    def _columns_plus_names(self):
+    def _generate_columns_plus_names(self, anon_for_dupe_key):
         cols = _select_iterables(self._raw_columns)
 
+        # when use_labels is on:
+        # in all cases == if we see the same label name, use _label_anon_label
+        # for subsequent occurences of that label
+        #
+        # anon_for_dupe_key == if we see the same column object multiple
+        # times under a particular name, whether it's the _label name or the
+        # anon label, apply _dedupe_label_anon_label to the subsequent
+        # occurrences of it.
+
         if self.use_labels:
-            names = set()
+            names = {}
 
             def name_for_col(c):
                 if c._label is None or not c._render_label_in_columns_clause:
                     return (None, c)
 
                 name = c._label
+
                 if name in names:
-                    name = c._label_anon_label
+                    # when looking to see if names[name] is the same column as
+                    # c, use hash(), so that an annotated version of the column
+                    # is seen as the same as the non-annotated
+                    if hash(names[name]) != hash(c):
+
+                        # different column under the same name.  apply
+                        # disambiguating label
+                        name = c._label_anon_label
+
+                        if anon_for_dupe_key and name in names:
+                            # here, c._label_anon_label is definitely unique to
+                            # that column identity (or annotated version), so
+                            # this should always be true.
+                            # this is also an infrequent codepath because
+                            # you need two levels of duplication to be here
+                            assert hash(names[name]) == hash(c)
+
+                            # the column under the disambiguating label is
+                            # already present.  apply the "dedupe" label to
+                            # subsequent occurrences of the column so that the
+                            # original stays non-ambiguous
+                            name = c._dedupe_label_anon_label
+                        else:
+                            names[name] = c
+                    elif anon_for_dupe_key:
+                        # same column under the same name. apply the "dedupe"
+                        # label so that the original stays non-ambiguous
+                        name = c._dedupe_label_anon_label
                 else:
-                    names.add(name)
+                    names[name] = c
                 return name, c
 
             return [name_for_col(c) for c in cols]
         else:
             return [(None, c) for c in cols]
 
+    @_memoized_property
+    def _columns_plus_names(self):
+        """generate label names plus columns to render in a SELECT."""
+
+        return self._generate_columns_plus_names(True)
+
     def _generate_fromclause_column_proxies(self, subquery):
+        """generate column proxies to place in the exported .c collection
+        of a subquery."""
+
         keys_seen = set()
         prox = []
 
-        for name, c in self._columns_plus_names:
+        for name, c in self._generate_columns_plus_names(False):
             if not hasattr(c, "_make_proxy"):
                 continue
             if name is None:
diff --git a/test/sql/test_compiler.py b/test/sql/test_compiler.py
index 36e9cd33bb..ca73f6c18c 100644
--- a/test/sql/test_compiler.py
+++ b/test/sql/test_compiler.py
@@ -571,16 +571,185 @@ class SelectTest(fixtures.TestBase, AssertsCompiledSQL):
         s = s.compile(dialect=default.DefaultDialect(paramstyle="qmark"))
         eq_(s.positiontup, ["a", "b", "c"])
 
+    def test_overlapping_labels_use_labels(self):
+        foo = table("foo", column("id"), column("bar_id"))
+        foo_bar = table("foo_bar", column("id"))
+
+        stmt = select([foo, foo_bar]).apply_labels()
+        self.assert_compile(
+            stmt,
+            "SELECT foo.id AS foo_id, foo.bar_id AS foo_bar_id, "
+            "foo_bar.id AS foo_bar_id_1 "
+            "FROM foo, foo_bar",
+        )
+
+    def test_overlapping_labels_plus_dupes_use_labels(self):
+        foo = table("foo", column("id"), column("bar_id"))
+        foo_bar = table("foo_bar", column("id"))
+
+        # current approach is:
+        # 1. positional nature of columns is always maintained in all cases
+        # 2. two different columns that have the same label, second one
+        #    is disambiguated
+        # 3. if the same column is repeated, it gets deduped using a special
+        #    'dedupe' label that will show two underscores
+        # 4. The disambiguating label generated in #2 also has to be deduped.
+        # 5. The derived columns, e.g. subquery().c etc. do not export the
+        #    "dedupe" columns, at all.  they are unreachable (because they
+        #    are unreachable anyway in SQL unless you use "SELECT *")
+        #
+        # this is all new logic necessitated by #4753 since we allow columns
+        # to be repeated.   We would still like the targeting of this column,
+        # both in a result set as well as in a derived selectable, to be
+        # unambiguous (DBs like postgresql won't let us reference an ambiguous
+        # label in a derived selectable even if its the same column repeated).
+        #
+        # this kind of thing happens of course because the ORM is in some
+        # more exotic cases writing in joins where columns may be duped.
+        # it might be nice to fix it on that side also, however SQLAlchemy
+        # has deduped columns in SELECT statements for 13 years so having a
+        # robust behavior when dupes are present is still very useful.
+
+        stmt = select(
+            [
+                foo.c.id,
+                foo.c.bar_id,
+                foo_bar.c.id,
+                foo.c.bar_id,
+                foo.c.id,
+                foo.c.bar_id,
+                foo_bar.c.id,
+                foo_bar.c.id,
+            ]
+        ).apply_labels()
+        self.assert_compile(
+            stmt,
+            "SELECT foo.id AS foo_id, "
+            "foo.bar_id AS foo_bar_id, "  # 1. 1st foo.bar_id, as is
+            "foo_bar.id AS foo_bar_id_1, "  # 2. 1st foo_bar.id, disamb from 1
+            "foo.bar_id AS foo_bar_id__1, "  # 3. 2nd foo.bar_id, dedupe from 1
+            "foo.id AS foo_id__1, "
+            "foo.bar_id AS foo_bar_id__1, "  # 4. 3rd foo.bar_id, same as 3
+            "foo_bar.id AS foo_bar_id__2, "  # 5. 2nd foo_bar.id
+            "foo_bar.id AS foo_bar_id__2 "  # 6. 3rd foo_bar.id, same as 5
+            "FROM foo, foo_bar",
+        )
+
+        # for the subquery, the labels created for repeated occurrences
+        # of the same column are not used.  only the label applied to the
+        # first occurrence of each column is used
+        self.assert_compile(
+            select([stmt.subquery()]),
+            "SELECT "
+            "anon_1.foo_id, "  # from 1st foo.id in derived (line 1)
+            "anon_1.foo_bar_id, "  # from 1st foo.bar_id in derived (line 2)
+            "anon_1.foo_bar_id_1, "  # from 1st foo_bar.id in derived (line 3)
+            "anon_1.foo_bar_id, "  # from 1st foo.bar_id in derived (line 2)
+            "anon_1.foo_id, "  # from 1st foo.id in derived (line 1)
+            "anon_1.foo_bar_id, "  # from 1st foo.bar_id in derived (line 2)
+            "anon_1.foo_bar_id_1, "  # from 1st foo_bar.id in derived (line 3)
+            "anon_1.foo_bar_id_1 "  # from 1st foo_bar.id in derived (line 3)
+            "FROM ("
+            "SELECT foo.id AS foo_id, "
+            "foo.bar_id AS foo_bar_id, "  # 1. 1st foo.bar_id, as is
+            "foo_bar.id AS foo_bar_id_1, "  # 2. 1st foo_bar.id, disamb from 1
+            "foo.bar_id AS foo_bar_id__1, "  # 3. 2nd foo.bar_id, dedupe from 1
+            "foo.id AS foo_id__1, "
+            "foo.bar_id AS foo_bar_id__1, "  # 4. 3rd foo.bar_id, same as 3
+            "foo_bar.id AS foo_bar_id__2, "  # 5. 2nd foo_bar.id
+            "foo_bar.id AS foo_bar_id__2 "  # 6. 3rd foo_bar.id, same as 5
+            "FROM foo, foo_bar"
+            ") AS anon_1",
+        )
+
     def test_dupe_columns_use_labels(self):
-        """as of 1.4, there's no deduping.
+        t = table("t", column("a"), column("b"))
+        self.assert_compile(
+            select([t.c.a, t.c.a, t.c.b, t.c.a]).apply_labels(),
+            "SELECT t.a AS t_a, t.a AS t_a__1, t.b AS t_b, "
+            "t.a AS t_a__1 FROM t",
+        )
 
-        however the labels will still uniqify themselves...
-        """
+    def test_dupe_columns_use_labels_derived_selectable(self):
+        t = table("t", column("a"), column("b"))
+        stmt = select([t.c.a, t.c.a, t.c.b, t.c.a]).apply_labels().subquery()
+
+        self.assert_compile(
+            select([stmt]),
+            "SELECT anon_1.t_a, anon_1.t_a, anon_1.t_b, anon_1.t_a FROM "
+            "(SELECT t.a AS t_a, t.a AS t_a__1, t.b AS t_b, t.a AS t_a__1 "
+            "FROM t) AS anon_1",
+        )
 
+    def test_dupe_columns_use_labels_mix_annotations(self):
         t = table("t", column("a"), column("b"))
+        a, b, a_a = t.c.a, t.c.b, t.c.a._annotate({"some_orm_thing": True})
+
+        self.assert_compile(
+            select([a, a_a, b, a_a]).apply_labels(),
+            "SELECT t.a AS t_a, t.a AS t_a__1, t.b AS t_b, "
+            "t.a AS t_a__1 FROM t",
+        )
+
+        self.assert_compile(
+            select([a_a, a, b, a_a]).apply_labels(),
+            "SELECT t.a AS t_a, t.a AS t_a__1, t.b AS t_b, "
+            "t.a AS t_a__1 FROM t",
+        )
+
+        self.assert_compile(
+            select([a_a, a_a, b, a]).apply_labels(),
+            "SELECT t.a AS t_a, t.a AS t_a__1, t.b AS t_b, "
+            "t.a AS t_a__1 FROM t",
+        )
+
+    def test_dupe_columns_use_labels_derived_selectable_mix_annotations(self):
+        t = table("t", column("a"), column("b"))
+        a, b, a_a = t.c.a, t.c.b, t.c.a._annotate({"some_orm_thing": True})
+        stmt = select([a, a_a, b, a_a]).apply_labels().subquery()
+
+        self.assert_compile(
+            select([stmt]),
+            "SELECT anon_1.t_a, anon_1.t_a, anon_1.t_b, anon_1.t_a FROM "
+            "(SELECT t.a AS t_a, t.a AS t_a__1, t.b AS t_b, t.a AS t_a__1 "
+            "FROM t) AS anon_1",
+        )
+
+    def test_overlapping_labels_plus_dupes_use_labels_mix_annotations(self):
+        foo = table("foo", column("id"), column("bar_id"))
+        foo_bar = table("foo_bar", column("id"))
+
+        foo_bar__id = foo_bar.c.id._annotate({"some_orm_thing": True})
+
+        stmt = select(
+            [
+                foo.c.bar_id,
+                foo_bar.c.id,
+                foo_bar.c.id,
+                foo_bar__id,
+                foo_bar__id,
+            ]
+        ).apply_labels()
+
+        self.assert_compile(
+            stmt,
+            "SELECT foo.bar_id AS foo_bar_id, foo_bar.id AS foo_bar_id_1, "
+            "foo_bar.id AS foo_bar_id__1, foo_bar.id AS foo_bar_id__1, "
+            "foo_bar.id AS foo_bar_id__1 FROM foo, foo_bar",
+        )
+
+    def test_dupe_columns_use_labels_from_anon(self):
+
+        t = table("t", column("a"), column("b"))
+        a = t.alias()
+
+        # second and third occurrences of a.c.a are labeled, but are
+        # dupes of each other.
         self.assert_compile(
-            select([t.c.a, t.c.a, t.c.b]).apply_labels(),
-            "SELECT t.a AS t_a, t.a AS t_a_1, t.b AS t_b FROM t",
+            select([a.c.a, a.c.a, a.c.b, a.c.a]).apply_labels(),
+            "SELECT t_1.a AS t_1_a, t_1.a AS t_1_a__1, t_1.b AS t_1_b, "
+            "t_1.a AS t_1_a__1 "
+            "FROM t AS t_1",
         )
 
     def test_nested_label_targeting(self):
diff --git a/test/sql/test_join_rewriting.py b/test/sql/test_join_rewriting.py
index 573455e7db..e713f5d73b 100644
--- a/test/sql/test_join_rewriting.py
+++ b/test/sql/test_join_rewriting.py
@@ -362,7 +362,7 @@ class JoinRewriteTest(_JoinRewriteTestBase, fixtures.TestBase):
     _a_bc_wdupes = (
         "SELECT a.id AS a_id, anon_1.b_id AS b_id, anon_1.b_a_id AS b_a_id, "
         "anon_1.c_id AS c_id, anon_1.c_b_id AS c_b_id, "
-        "anon_1.b_a_id AS b_a_id_1, anon_1.c_b_id AS c_b_id_1 "
+        "anon_1.b_a_id AS b_a_id__1, anon_1.c_b_id AS c_b_id__1 "
         "FROM a JOIN "
         "(SELECT b.id AS b_id, b.a_id AS b_a_id, c.id AS c_id, "
         "c.b_id AS c_b_id "
@@ -373,19 +373,10 @@ class JoinRewriteTest(_JoinRewriteTestBase, fixtures.TestBase):
 
     _a_bc_wdupes_anon_map = (
         "SELECT 1 FROM (SELECT a.id AS a_id, b.id AS b_id, b.a_id AS b_a_id, "
-        "c.id AS c_id, c.b_id AS c_b_id, b.a_id AS b_a_id_1, "
-        "c.b_id AS c_b_id_1 "
-        "FROM a JOIN (b JOIN c ON b.id = c.b_id) ON a.id = b.a_id "
-        "WHERE b.id = :id_1 AND c.id = :id_2) AS anon_1 "
-        "WHERE anon_1.b_a_id_1 = anon_1.c_b_id_1"
-    )
-
-    _a_bc_wdupes_anon_map = (
-        "SELECT 1 FROM (SELECT a.id AS a_id, b.id AS b_id, b.a_id AS b_a_id, "
-        "c.id AS c_id, c.b_id AS c_b_id, b.a_id AS b_a_id_1, "
-        "c.b_id AS c_b_id_1 FROM a JOIN (b JOIN c ON b.id = c.b_id) "
+        "c.id AS c_id, c.b_id AS c_b_id, b.a_id AS b_a_id__1, "
+        "c.b_id AS c_b_id__1 FROM a JOIN (b JOIN c ON b.id = c.b_id) "
         "ON a.id = b.a_id WHERE b.id = :id_1 AND c.id = :id_2) AS anon_1 "
-        "WHERE anon_1.b_a_id_1 = anon_1.c_b_id_1"
+        "WHERE anon_1.b_a_id = anon_1.c_b_id"
     )
 
     _a_bc_comma_a1_selbc = (
@@ -477,7 +468,7 @@ class JoinRewriteTest(_JoinRewriteTestBase, fixtures.TestBase):
     _b_a_id_double_overlap_annotated = (
         "SELECT anon_1.b_id AS anon_1_b_id, anon_1.b_a_id AS anon_1_b_a_id, "
         "anon_1.b_a_id_1 AS anon_1_b_a_id_1 "
-        "FROM (SELECT b.id AS b_id, b.a_id AS b_a_id, b_a.id AS b_a_id_2 "
+        "FROM (SELECT b.id AS b_id, b.a_id AS b_a_id, b_a.id AS b_a_id_1 "
         "FROM b JOIN b_a ON b.id = b_a.id) AS anon_1"
     )
 
@@ -534,7 +525,7 @@ class JoinPlainTest(_JoinRewriteTestBase, fixtures.TestBase):
 
     _a_bc_wdupes = (
         "SELECT a.id AS a_id, b.id AS b_id, b.a_id AS b_a_id, c.id AS c_id, "
-        "c.b_id AS c_b_id, b.a_id AS b_a_id_1, c.b_id AS c_b_id_1 "
+        "c.b_id AS c_b_id, b.a_id AS b_a_id__1, c.b_id AS c_b_id__1 "
         "FROM a JOIN "
         "(b JOIN c ON b.id = c.b_id) "
         "ON a.id = b.a_id "
@@ -544,11 +535,11 @@ class JoinPlainTest(_JoinRewriteTestBase, fixtures.TestBase):
 
     _a_bc_wdupes_anon_map = (
         "SELECT 1 FROM (SELECT a.id AS a_id, b.id AS b_id, b.a_id AS b_a_id, "
-        "c.id AS c_id, c.b_id AS c_b_id, b.a_id AS b_a_id_1, "
-        "c.b_id AS c_b_id_1 "
+        "c.id AS c_id, c.b_id AS c_b_id, b.a_id AS b_a_id__1, "
+        "c.b_id AS c_b_id__1 "
         "FROM a JOIN (b JOIN c ON b.id = c.b_id) ON a.id = b.a_id "
         "WHERE b.id = :id_1 AND c.id = :id_2) AS anon_1 "
-        "WHERE anon_1.b_a_id_1 = anon_1.c_b_id_1"
+        "WHERE anon_1.b_a_id = anon_1.c_b_id"
     )
 
     _a_bc_comma_a1_selbc = (
@@ -680,11 +671,11 @@ class JoinNoUseLabelsTest(_JoinRewriteTestBase, fixtures.TestBase):
 
     _a_bc_wdupes_anon_map = (
         "SELECT 1 FROM (SELECT a.id AS a_id, b.id AS b_id, b.a_id AS b_a_id, "
-        "c.id AS c_id, c.b_id AS c_b_id, b.a_id AS b_a_id_1, "
-        "c.b_id AS c_b_id_1 "
+        "c.id AS c_id, c.b_id AS c_b_id, b.a_id AS b_a_id__1, "
+        "c.b_id AS c_b_id__1 "
         "FROM a JOIN (b JOIN c ON b.id = c.b_id) ON a.id = b.a_id "
         "WHERE b.id = :id_1 AND c.id = :id_2) AS anon_1 "
-        "WHERE anon_1.b_a_id_1 = anon_1.c_b_id_1"
+        "WHERE anon_1.b_a_id = anon_1.c_b_id"
     )
 
     _a_bc_comma_a1_selbc = (
diff --git a/test/sql/test_labels.py b/test/sql/test_labels.py
index 638d4ee7f1..e953e71a1a 100644
--- a/test/sql/test_labels.py
+++ b/test/sql/test_labels.py
@@ -924,9 +924,9 @@ class ColExprLabelTest(fixtures.TestBase, AssertsCompiledSQL):
                 ]
             ).apply_labels(),
             "SELECT some_table.name AS some_table_name, "
-            "some_table.name AS some_table_name_1, "
-            "SOME_COL_THING(some_table.name) AS some_table_name_2, "
-            "SOME_COL_THING(some_table.name) AS some_table_name_3 "
+            "some_table.name AS some_table_name__1, "
+            "SOME_COL_THING(some_table.name) AS some_table_name_1, "
+            "SOME_COL_THING(some_table.name) AS some_table_name_2 "
             "FROM some_table",
         )
 
diff --git a/test/sql/test_selectable.py b/test/sql/test_selectable.py
index 76cece5e40..00b9b68c7e 100644
--- a/test/sql/test_selectable.py
+++ b/test/sql/test_selectable.py
@@ -2735,6 +2735,7 @@ class WithLabelsTest(fixtures.TestBase):
         eq_(
             list(sel.subquery().c.keys()),
             ["t_x_id", t2.c.id._label_anon_label],
+            # ["t_x_id", "t_x_id"]  # if we turn off deduping entirely,
         )
         self._assert_result_keys(sel, ["t_x_id", "t_x_id_1"])
         self._assert_subq_result_keys(sel, ["t_x_id", "t_x_id_1"])
@@ -2803,7 +2804,13 @@ class WithLabelsTest(fixtures.TestBase):
             list(sel.selected_columns.keys()),
             ["t_x_a", t2.c.a._label_anon_label],
         )
+
+        # deduping for different cols but same label
         eq_(list(sel.subquery().c.keys()), ["t_x_a", t2.c.a._label_anon_label])
+
+        # if we turn off deduping entirely
+        # eq_(list(sel.subquery().c.keys()), ["t_x_a", "t_x_a"])
+
         self._assert_result_keys(sel, ["t_x_id", "t_x_id_1"])
         self._assert_subq_result_keys(sel, ["t_x_id", "t_x_id_1"])