From: Mike Bayer Date: Thu, 10 Nov 2022 22:01:58 +0000 (-0500) Subject: ensure anon_map is passed for most annotated traversals X-Git-Tag: rel_2_0_0b4~62^2 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1d8833a9c1ada64cfc716109a8836b32cb8a9bd6;p=thirdparty%2Fsqlalchemy%2Fsqlalchemy.git ensure anon_map is passed for most annotated traversals We can cache the annotated cache key for Table, but for selectables it's not safe, as it fails to pass the anon_map along and creates many redudant structures in observed test scenario. It is likely safe for a Column that's mapped to a Table also, however this is not implemented here. Will have to see if that part needs adjusting. Fixed critical memory issue identified in cache key generation, where for very large and complex ORM statements that make use of lots of ORM aliases with subqueries, cache key generation could produce excessively large keys that were orders of magnitude bigger than the statement itself. Much thanks to Rollo Konig Brock for their very patient, long term help in finally identifying this issue. Also within TypeEngine objects, when we generate elements for instance variables, skip the None elements at least. this also saves on tuple complexity. Fixes: #8790 Change-Id: I448ddbfb45ae0a648815be8dad4faad7d1977427 --- diff --git a/doc/build/changelog/unreleased_14/8790.rst b/doc/build/changelog/unreleased_14/8790.rst new file mode 100644 index 0000000000..a3214801c8 --- /dev/null +++ b/doc/build/changelog/unreleased_14/8790.rst @@ -0,0 +1,10 @@ +.. change:: + :tags: bug, sql + :tickets: 8790 + + Fixed critical memory issue identified in cache key generation, where for + very large and complex ORM statements that make use of lots of ORM aliases + with subqueries, cache key generation could produce excessively large keys + that were orders of magnitude bigger than the statement itself. Much thanks + to Rollo Konig Brock for their very patient, long term help in finally + identifying this issue. diff --git a/lib/sqlalchemy/sql/annotation.py b/lib/sqlalchemy/sql/annotation.py index 262048bd1d..43ca84abb3 100644 --- a/lib/sqlalchemy/sql/annotation.py +++ b/lib/sqlalchemy/sql/annotation.py @@ -94,12 +94,18 @@ class SupportsAnnotations(ExternallyTraversible): @util.memoized_property def _annotations_cache_key(self) -> Tuple[Any, ...]: anon_map_ = anon_map() + + return self._gen_annotations_cache_key(anon_map_) + + def _gen_annotations_cache_key( + self, anon_map: anon_map + ) -> Tuple[Any, ...]: return ( "_annotations", tuple( ( key, - value._gen_cache_key(anon_map_, []) + value._gen_cache_key(anon_map, []) if isinstance(value, HasCacheKey) else value, ) diff --git a/lib/sqlalchemy/sql/cache_key.py b/lib/sqlalchemy/sql/cache_key.py index 88148285c5..39d09d3ab5 100644 --- a/lib/sqlalchemy/sql/cache_key.py +++ b/lib/sqlalchemy/sql/cache_key.py @@ -297,12 +297,17 @@ class HasCacheKey: else None, ) elif meth is InternalTraversal.dp_annotations_key: - # obj is here is the _annotations dict. however, we - # want to use the memoized cache key version of it. for - # Columns, this should be long lived. For select() - # statements, not so much, but they usually won't have - # annotations. - result += self._annotations_cache_key # type: ignore + # obj is here is the _annotations dict. Table uses + # a memoized version of it. however in other cases, + # we generate it given anon_map as we may be from a + # Join, Aliased, etc. + # see #8790 + + if self._gen_static_annotations_cache_key: # type: ignore # noqa: E501 + result += self._annotations_cache_key # type: ignore # noqa: E501 + else: + result += self._gen_annotations_cache_key(anon_map) # type: ignore # noqa: E501 + elif ( meth is InternalTraversal.dp_clauseelement_list or meth is InternalTraversal.dp_clauseelement_tuple diff --git a/lib/sqlalchemy/sql/elements.py b/lib/sqlalchemy/sql/elements.py index 3b70e8d4e9..6a5aa7db92 100644 --- a/lib/sqlalchemy/sql/elements.py +++ b/lib/sqlalchemy/sql/elements.py @@ -334,6 +334,7 @@ class ClauseElement( _is_column_element = False _is_keyed_column_element = False _is_table = False + _gen_static_annotations_cache_key = False _is_textual = False _is_from_clause = False _is_returns_rows = False @@ -3224,7 +3225,7 @@ class Cast(WrapsColumnExpression[_T]): _traverse_internals: _TraverseInternalsType = [ ("clause", InternalTraversal.dp_clauseelement), - ("typeclause", InternalTraversal.dp_clauseelement), + ("type", InternalTraversal.dp_type), ] clause: ColumnElement[Any] @@ -3631,7 +3632,20 @@ class BinaryExpression(OperatorExpression[_T]): ( "type", InternalTraversal.dp_type, - ), # affects JSON CAST operators + ), + ] + + _cache_key_traversal = [ + ("left", InternalTraversal.dp_clauseelement), + ("right", InternalTraversal.dp_clauseelement), + ("operator", InternalTraversal.dp_operator), + ("modifiers", InternalTraversal.dp_plain_dict), + # "type" affects JSON CAST operators, so while redundant in most cases, + # is needed for that one + ( + "type", + InternalTraversal.dp_type, + ), ] _is_implicitly_boolean = True @@ -3816,6 +3830,10 @@ class Grouping(GroupedElement, ColumnElement[_T]): ("type", InternalTraversal.dp_type), ] + _cache_key_traversal = [ + ("element", InternalTraversal.dp_clauseelement), + ] + element: Union[TextClause, ClauseList, ColumnElement[_T]] def __init__( @@ -4322,6 +4340,11 @@ class Label(roles.LabeledColumnExprRole[_T], NamedColumn[_T]): ("_element", InternalTraversal.dp_clauseelement), ] + _cache_key_traversal = [ + ("name", InternalTraversal.dp_anon_name), + ("_element", InternalTraversal.dp_clauseelement), + ] + _element: ColumnElement[_T] name: str diff --git a/lib/sqlalchemy/sql/schema.py b/lib/sqlalchemy/sql/schema.py index 36c33868af..dde5cd3729 100644 --- a/lib/sqlalchemy/sql/schema.py +++ b/lib/sqlalchemy/sql/schema.py @@ -2023,6 +2023,17 @@ class Column(DialectKWArgs, SchemaItem, ColumnClause[_T]): identity: Optional[Identity] + @util.memoized_property + def _gen_static_annotations_cache_key(self) -> bool: # type: ignore + """special attribute used by cache key gen, if true, we will + use a static cache key for the annotations dictionary, else we + will generate a new cache key for annotations each time. + + Added for #8790 + + """ + return self.table is not None and self.table._is_table + def _extra_kwargs(self, **kwargs: Any) -> None: self._validate_dialect_kwargs(kwargs) diff --git a/lib/sqlalchemy/sql/type_api.py b/lib/sqlalchemy/sql/type_api.py index 90320701ea..cd57ee3b64 100644 --- a/lib/sqlalchemy/sql/type_api.py +++ b/lib/sqlalchemy/sql/type_api.py @@ -940,7 +940,9 @@ class TypeEngine(Visitable, Generic[_T]): else self.__dict__[k], ) for k in names - if k in self.__dict__ and not k.startswith("_") + if k in self.__dict__ + and not k.startswith("_") + and self.__dict__[k] is not None ) @overload diff --git a/lib/sqlalchemy/testing/__init__.py b/lib/sqlalchemy/testing/__init__.py index 0c83cb469a..76445a4442 100644 --- a/lib/sqlalchemy/testing/__init__.py +++ b/lib/sqlalchemy/testing/__init__.py @@ -31,6 +31,7 @@ from .assertions import expect_raises from .assertions import expect_raises_message from .assertions import expect_warnings from .assertions import in_ +from .assertions import int_within_variance from .assertions import is_ from .assertions import is_false from .assertions import is_instance_of diff --git a/lib/sqlalchemy/testing/assertions.py b/lib/sqlalchemy/testing/assertions.py index 44e7e892f8..321c05b446 100644 --- a/lib/sqlalchemy/testing/assertions.py +++ b/lib/sqlalchemy/testing/assertions.py @@ -236,6 +236,17 @@ def _assert_no_stray_pool_connections(): engines.testing_reaper.assert_all_closed() +def int_within_variance(expected, received, variance): + deviance = int(expected * variance) + assert ( + abs(received - expected) < deviance + ), "Given int value %s is not within %d%% of expected value %s" % ( + received, + variance * 100, + expected, + ) + + def eq_regex(a, b, msg=None): assert re.match(b, a), msg or "%r !~ %r" % (a, b) diff --git a/lib/sqlalchemy/testing/util.py b/lib/sqlalchemy/testing/util.py index 6fd42af702..74b1ca9928 100644 --- a/lib/sqlalchemy/testing/util.py +++ b/lib/sqlalchemy/testing/util.py @@ -9,10 +9,13 @@ from __future__ import annotations +from collections import deque import decimal import gc +from itertools import chain import random import sys +from sys import getsizeof import types from . import config @@ -459,3 +462,63 @@ def teardown_events(event_cls): event_cls._clear() return decorate + + +def total_size(o): + """Returns the approximate memory footprint an object and all of its + contents. + + source: https://code.activestate.com/recipes/577504/ + + + """ + + def dict_handler(d): + return chain.from_iterable(d.items()) + + all_handlers = { + tuple: iter, + list: iter, + deque: iter, + dict: dict_handler, + set: iter, + frozenset: iter, + } + seen = set() # track which object id's have already been seen + default_size = getsizeof(0) # estimate sizeof object without __sizeof__ + + def sizeof(o): + if id(o) in seen: # do not double count the same object + return 0 + seen.add(id(o)) + s = getsizeof(o, default_size) + + for typ, handler in all_handlers.items(): + if isinstance(o, typ): + s += sum(map(sizeof, handler(o))) + break + return s + + return sizeof(o) + + +def count_cache_key_tuples(tup): + """given a cache key tuple, counts how many instances of actual + tuples are found. + + used to alert large jumps in cache key complexity. + + """ + stack = [tup] + + sentinel = object() + num_elements = 0 + + while stack: + elem = stack.pop(0) + if elem is sentinel: + num_elements += 1 + elif isinstance(elem, tuple): + if elem: + stack = list(elem) + [sentinel] + stack + return num_elements diff --git a/lib/sqlalchemy/util/__init__.py b/lib/sqlalchemy/util/__init__.py index 4952cb5011..bb4642a4ff 100644 --- a/lib/sqlalchemy/util/__init__.py +++ b/lib/sqlalchemy/util/__init__.py @@ -62,6 +62,7 @@ from .compat import local_dataclass_fields as local_dataclass_fields from .compat import osx as osx from .compat import py310 as py310 from .compat import py311 as py311 +from .compat import py312 as py312 from .compat import py38 as py38 from .compat import py39 as py39 from .compat import pypy as pypy diff --git a/lib/sqlalchemy/util/compat.py b/lib/sqlalchemy/util/compat.py index cda5ab6c12..2899b42586 100644 --- a/lib/sqlalchemy/util/compat.py +++ b/lib/sqlalchemy/util/compat.py @@ -30,6 +30,7 @@ from typing import Tuple from typing import Type +py312 = sys.version_info >= (3, 12) py311 = sys.version_info >= (3, 11) py310 = sys.version_info >= (3, 10) py39 = sys.version_info >= (3, 9) diff --git a/test/orm/test_cache_key.py b/test/orm/test_cache_key.py index 70770089c3..3106a71ad7 100644 --- a/test/orm/test_cache_key.py +++ b/test/orm/test_cache_key.py @@ -8,11 +8,14 @@ from sqlalchemy import Integer from sqlalchemy import literal_column from sqlalchemy import null from sqlalchemy import select +from sqlalchemy import String from sqlalchemy import Table from sqlalchemy import testing from sqlalchemy import text from sqlalchemy import true from sqlalchemy import update +from sqlalchemy import util +from sqlalchemy.ext.declarative import ConcreteBase from sqlalchemy.orm import aliased from sqlalchemy.orm import Bundle from sqlalchemy.orm import defaultload @@ -37,10 +40,15 @@ from sqlalchemy.sql.visitors import InternalTraversal from sqlalchemy.testing import AssertsCompiledSQL from sqlalchemy.testing import eq_ from sqlalchemy.testing import fixtures +from sqlalchemy.testing import int_within_variance from sqlalchemy.testing import ne_ +from sqlalchemy.testing.fixtures import DeclarativeMappedTest from sqlalchemy.testing.fixtures import fixture_session +from sqlalchemy.testing.util import count_cache_key_tuples +from sqlalchemy.testing.util import total_size from test.orm import _fixtures from .inheritance import _poly_fixtures +from .test_events import _RemoveListeners from .test_query import QueryTest @@ -1037,3 +1045,80 @@ class CompositeTest(fixtures.MappedTest): ) eq_(stmt._generate_cache_key(), stmt2._generate_cache_key()) + + +class EmbeddedSubqTest(_RemoveListeners, DeclarativeMappedTest): + """test #8790. + + it's expected that cache key structures will change, this test is here + testing something fairly similar to the issue we had (though vastly + smaller scale) so we mostly want to look for surprise jumps here. + + """ + + @classmethod + def setup_classes(cls): + Base = cls.DeclarativeBasic + + class Employee(ConcreteBase, Base): + __tablename__ = "employee" + id = Column(Integer, primary_key=True) + name = Column(String(50)) + + __mapper_args__ = { + "polymorphic_identity": "employee", + "concrete": True, + } + + class Manager(Employee): + __tablename__ = "manager" + id = Column(Integer, primary_key=True) + name = Column(String(50)) + manager_data = Column(String(40)) + + __mapper_args__ = { + "polymorphic_identity": "manager", + "concrete": True, + } + + class Engineer(Employee): + __tablename__ = "engineer" + id = Column(Integer, primary_key=True) + name = Column(String(50)) + engineer_info = Column(String(40)) + + __mapper_args__ = { + "polymorphic_identity": "engineer", + "concrete": True, + } + + @testing.combinations("tuples", "memory", argnames="assert_on") + def test_cache_key_gen(self, assert_on): + Employee = self.classes.Employee + + e1 = aliased(Employee) + e2 = aliased(Employee) + + subq = select(e1).union_all(select(e2)).subquery() + + anno = aliased(Employee, subq) + + stmt = select(anno) + + ck = stmt._generate_cache_key() + + if assert_on == "tuples": + # before the fix for #8790 this was 700 + int_within_variance(142, count_cache_key_tuples(ck), 0.05) + + elif assert_on == "memory": + # before the fix for #8790 this was 55154 + + if util.py312: + testing.skip_test("python platform not available") + elif util.py311: + int_within_variance(39996, total_size(ck), 0.05) + elif util.py310: + int_within_variance(29796, total_size(ck), 0.05) + else: + testing.skip_test("python platform not available")