From: Oliver Parker Date: Wed, 3 Jun 2026 06:51:13 +0000 (-0400) Subject: Perf/conditional unique selectinload X-Git-Url: http://git.ipfire.org/gitweb/index.cgi?a=commitdiff_plain;h=efa87e25009f815f47a2a3a2a0e639bddf33a459;p=thirdparty%2Fsqlalchemy%2Fsqlalchemy.git Perf/conditional unique selectinload Optimized :func:`_orm.selectinload` to skip the ``.unique()`` call on inner result sets when no nested :func:`_orm.joinedload` on a collection is present. The uniquing pass is only required when a joined eager load inflates rows due to a one-to-many or many-to-many JOIN; in the common case of a leaf selectin load, rows are already unique by construction and the per-row hashing overhead can be avoided. As a side effect, ``yield_per`` set in a ``do_orm_execute`` event for a :func:`_orm.selectinload` relationship load no longer raises ``InvalidRequestError`` when no nested collection joinedload is in effect, since ``.unique()`` is no longer called in that path. Pull request courtesy Oliver Parker. `_SelectInLoader._load_via_parent` and `_load_via_child` currently call `.unique()` unconditionally on the inner `Result`. The uniqueness pass is only required when a nested `joinedload` on a collection is in effect — in that case the `JOIN` inflates rows (one row per `(child, grandchild)` instead of one per child) and the outer `groupby` would produce duplicated entries without dedup. `loading.instances()` already signals exactly this condition: it sets `Result._unique_filter_state` to a `require_unique` guard when the inner compile state has `multi_row_eager_loaders=True`. When that flag is unset (no nested collection `joinedload`), `_unique_filter_state` stays `None`, meaning the inner query produces unique rows by construction: - `omit_join` 1:N — one row per child - `omit_join` M2O — one row per parent - `omit_join` M2M — one row per (parent, entity) - `load_with_join` (non-omit) — one row per (parent, child) This PR makes the `.unique()` call conditional, via a new `_has_unique_filter` property on `Result` that exposes this state without reaching into the private `_unique_filter_state` attribute directly: ```python if result._has_unique_filter: result = result.unique() ``` Per-row hashing in `_iterator_getter` is avoided in the common leaf-load case. On an in-memory SQLite bench (2000 parents × 5 children + M:N tags, Python 3.14, n=1000 iterations): | Workload | before avg | after avg | delta | before sd | after sd | |---|---|---|---|---|---| | selectin children (1:N) | 56.0 ms | 41.4 ms | **−26%** | 5.2 ms | 1.9 ms | | selectin tags M2M | 25.6 ms | 20.0 ms | **−22%** | 4.2 ms | 3.1 ms | | joined children (1:N) | 42.9 ms | 37.6 ms | ~0% | — | — | | plain parent load | 4.2 ms | 3.6 ms | ~0% | — | — | **Behaviour change:** `yield_per` set in a `do_orm_execute` event for a relationship load no longer raises `InvalidRequestError("Can't use yield_per in conjunction with unique")` for `selectinload` without a nested collection `joinedload` — because `.unique()` is no longer called in that path. `immediateload` is unaffected (it still calls `.unique()` unconditionally). Fixes: #13339 Closes: #13341 Pull-request: https://github.com/sqlalchemy/sqlalchemy/pull/13341 Pull-request-sha: 980af0383b174271bb602a890f595c26f78ddcee Change-Id: I12e993f104d354ae894f81f9b09fbcf9a95b0027 --- diff --git a/doc/build/changelog/unreleased_21/13339.rst b/doc/build/changelog/unreleased_21/13339.rst new file mode 100644 index 0000000000..43dbbfe071 --- /dev/null +++ b/doc/build/changelog/unreleased_21/13339.rst @@ -0,0 +1,14 @@ +.. change:: + :tags: usecase, orm, performance + :tickets: 13339 + + Optimized :func:`_orm.selectinload` to skip the ``.unique()`` call on inner + result sets when no nested :func:`_orm.joinedload` on a collection is + present. The uniquing pass is only required when a joined eager load + inflates rows due to a one-to-many or many-to-many JOIN; in the common case + of a leaf selectin load, rows are already unique by construction and the + per-row hashing overhead can be avoided. As a side effect, ``yield_per`` + set in a ``do_orm_execute`` event for a :func:`_orm.selectinload` + relationship load no longer raises ``InvalidRequestError`` when no nested + collection joinedload is in effect, since ``.unique()`` is no longer called + in that path. Pull request courtesy Oliver Parker. diff --git a/lib/sqlalchemy/engine/result.py b/lib/sqlalchemy/engine/result.py index 280449f7e6..d5f31f8e70 100644 --- a/lib/sqlalchemy/engine/result.py +++ b/lib/sqlalchemy/engine/result.py @@ -1826,11 +1826,13 @@ class IteratorResult(Result[Unpack[_Ts]]): iterator: Iterator[_InterimSupportsScalarsRowType], raw: Optional[Result[Any]] = None, _source_supports_scalars: bool = False, + context: Optional[Any] = None, ): self._metadata = cursor_metadata self.iterator = iterator self.raw = raw self._source_supports_scalars = _source_supports_scalars + self.context = context @property def closed(self) -> bool: @@ -1933,6 +1935,7 @@ class ChunkedIteratorResult(IteratorResult[Unpack[_Ts]]): source_supports_scalars: bool = False, raw: Optional[Result[Any]] = None, dynamic_yield_per: bool = False, + context: Optional[Any] = None, ): self._metadata = cursor_metadata self.chunks = chunks @@ -1940,6 +1943,7 @@ class ChunkedIteratorResult(IteratorResult[Unpack[_Ts]]): self.raw = raw self.iterator = itertools.chain.from_iterable(self.chunks(None)) self.dynamic_yield_per = dynamic_yield_per + self.context = context @_generative def yield_per(self, num: int) -> Self: diff --git a/lib/sqlalchemy/orm/context.py b/lib/sqlalchemy/orm/context.py index d09956537d..ab6b7b3117 100644 --- a/lib/sqlalchemy/orm/context.py +++ b/lib/sqlalchemy/orm/context.py @@ -133,6 +133,10 @@ class QueryContext: post_load_paths: Dict[PathRegistry, _PostLoad] compile_state: _ORMCompileState + @property + def requires_uniquing(self) -> bool: + return bool(self.compile_state.multi_row_eager_loaders) + class default_load_options(Options): _only_return_tuples = False _populate_existing = False diff --git a/lib/sqlalchemy/orm/loading.py b/lib/sqlalchemy/orm/loading.py index 2b1ff19acb..6bae934508 100644 --- a/lib/sqlalchemy/orm/loading.py +++ b/lib/sqlalchemy/orm/loading.py @@ -281,6 +281,7 @@ def instances( source_supports_scalars=single_entity, raw=cursor, dynamic_yield_per=cursor.context._is_server_side, + context=context, ) # filtered and single_entity are used to indicate to legacy Query that the @@ -291,7 +292,7 @@ def instances( ) # multi_row_eager_loaders OTOH is specific to joinedload. - if context.compile_state.multi_row_eager_loaders: + if context.requires_uniquing: def require_unique(obj): raise sa_exc.InvalidRequestError( diff --git a/lib/sqlalchemy/orm/strategies.py b/lib/sqlalchemy/orm/strategies.py index 3bfb511a11..dae48d2e6d 100644 --- a/lib/sqlalchemy/orm/strategies.py +++ b/lib/sqlalchemy/orm/strategies.py @@ -3416,19 +3416,18 @@ class _SelectInLoader(_PostLoader, util.MemoizedSlots): while our_keys: chunk = our_keys[0:chunksize] our_keys = our_keys[chunksize:] - data = { - k: v - for k, v in context.session.execute( - q, - params={ - "primary_keys": [ - key[0] if query_info.zero_idx else key - for key in chunk - ] - }, - execution_options=execution_options, - ).unique() - } + result = context.session.execute( + q, + params={ + "primary_keys": [ + key[0] if query_info.zero_idx else key for key in chunk + ] + }, + execution_options=execution_options, + ) + if result.context is not None and result.context.requires_uniquing: + result = result.unique() + data = {k: v for k, v in result} for key in chunk: # for a real foreign key and no concurrent changes to the @@ -3470,15 +3469,15 @@ class _SelectInLoader(_PostLoader, util.MemoizedSlots): for key, state, state_dict, overwrite in chunk ] + result = context.session.execute( + q, + params={"primary_keys": primary_keys}, + execution_options=execution_options, + ) + if result.context is not None and result.context.requires_uniquing: + result = result.unique() data = collections.defaultdict(list) - for k, v in itertools.groupby( - context.session.execute( - q, - params={"primary_keys": primary_keys}, - execution_options=execution_options, - ).unique(), - lambda x: x[0], - ): + for k, v in itertools.groupby(result, lambda x: x[0]): data[k].extend(vv[1] for vv in v) for key, state, state_dict, overwrite in chunk: diff --git a/test/orm/test_events.py b/test/orm/test_events.py index 11756e365d..b39d3f533c 100644 --- a/test/orm/test_events.py +++ b/test/orm/test_events.py @@ -528,8 +528,14 @@ class ORMExecuteTest( "autoflush": True, "is_post_load": True, } + # yield_per conflicts with unique() inside the loader. immediateload + # calls unique() unconditionally; selectinload only calls unique() when + # a nested joinedload on a collection inflates rows, so without one it + # no longer conflicts. operation_should_fail = ( - event_style.set_in_event_for_relationship and option_type.yield_per + event_style.set_in_event_for_relationship + and option_type.yield_per + and loader_opt is immediateload ) if event_style.noop_event: @@ -544,8 +550,7 @@ class ORMExecuteTest( # event where we actually set these options for relationship # loaders. assert that what we do here does in fact get # to those loaders. the yield_per setting will fail for - # immediateload and selectinload because they both use unique() - # on the result. + # immediateload because it calls unique() unconditionally. def go(context): if context.is_relationship_load: if option_type.yield_per: diff --git a/test/orm/test_selectin_relations.py b/test/orm/test_selectin_relations.py index ba1d7e4f56..af6f8fe5cf 100644 --- a/test/orm/test_selectin_relations.py +++ b/test/orm/test_selectin_relations.py @@ -4227,6 +4227,42 @@ class M2MOmitJoinTest( [3], ) + def test_m2m_selectin_no_duplicate_children(self, simple_m2m, connection): + """selectinload over m2m (omit_join fast path) must load the correct + collection members and must not produce duplicate items. + + The simple_m2m fixture has a1.bs=[b1,b2] and a2.bs=[b1]. + """ + A = simple_m2m + + with Session(connection) as session: + + def go(): + statement = ( + select(A).options(selectinload(A.bs)).order_by(A.id) + ) + return session.execute(statement).scalars().all() + + results = self.assert_sql_execution( + connection, + go, + CompiledSQL("SELECT a.id FROM a ORDER BY a.id", {}), + CompiledSQL( + "SELECT a_b.a_id, b.id " + "FROM a_b JOIN b ON b.id = a_b.b_id " + "WHERE a_b.a_id IN " + "(__[POSTCOMPILE_primary_keys])", + {"primary_keys": [1, 2]}, + ), + ) + + eq_(sorted(b.id for b in results[0].bs), [1, 2]) + eq_(sorted(b.id for b in results[1].bs), [1]) + + for a in results: + b_ids = [b.id for b in a.bs] + eq_(len(b_ids), len(set(b_ids))) + class SameNamePolymorphicTest(fixtures.DeclarativeMappedTest): @classmethod @@ -4412,6 +4448,150 @@ class TestBakedCancelsCorrectly(fixtures.DeclarativeMappedTest): self.assert_sql_count(testing.db, go, 2) +class TestSelectinWithNestedJoinedCollectionDedup( + fixtures.DeclarativeMappedTest +): + """Regression guard for selectinload(...).joinedload/selectinload(...) on a + collection. + + When the inner selectin query uses joinedload on a collection, the result + rows are multiplied (one row per grandchild). The .unique() call in + _load_via_parent must deduplicate those rows so that each parent row ends + up with exactly one copy of each child object. + + Also verifies that the joinedload is folded into the selectin query rather + than issuing a separate third query — the total SQL count must be 2 for + joinedload and 3 for nested selectinload. + """ + + @classmethod + def setup_classes(cls): + Base = cls.DeclarativeBasic + + class User(ComparableEntity, Base): + __tablename__ = "sel_dedup_user" + id = Column(Integer, primary_key=True) + name = Column(String(50)) + addresses = relationship( + "Address", back_populates="user", order_by="Address.id" + ) + + class Address(ComparableEntity, Base): + __tablename__ = "sel_dedup_address" + id = Column(Integer, primary_key=True) + user_id = Column(ForeignKey("sel_dedup_user.id")) + email = Column(String(50)) + user = relationship("User", back_populates="addresses") + dingalings = relationship( + "Dingaling", back_populates="address", order_by="Dingaling.id" + ) + + class Dingaling(ComparableEntity, Base): + __tablename__ = "sel_dedup_dingaling" + id = Column(Integer, primary_key=True) + address_id = Column(ForeignKey("sel_dedup_address.id")) + data = Column(String(50)) + address = relationship("Address", back_populates="dingalings") + + @classmethod + def insert_data(cls, connection): + User, Address, Dingaling = cls.classes("User", "Address", "Dingaling") + sess = Session(connection) + # user 1: one address with TWO dingalings — this is the key fixture. + # When selectinload(User.addresses).joinedload(Address.dingalings) runs + # the inner selectin query, address 1 will appear in 2 result rows + # (one per dingaling). Without .unique(), loading.require_unique would + # raise InvalidRequestError: "The unique() method must be invoked on + # this Result". + sess.add( + User( + id=1, + name="u1", + addresses=[ + Address( + id=1, + email="a1@example.com", + dingalings=[ + Dingaling(id=1, data="d1"), + Dingaling(id=2, data="d2"), + ], + ), + Address(id=2, email="a2@example.com"), + ], + ) + ) + # user 2: one address, one dingaling — control case + sess.add( + User( + id=2, + name="u2", + addresses=[ + Address( + id=3, + email="a3@example.com", + dingalings=[Dingaling(id=3, data="d3")], + ) + ], + ) + ) + sess.commit() + + @testing.combinations( + ("joinedload", 2), + ("selectinload", 3), + id_="sa", + argnames="inner_loader_name,expected_sql_count", + ) + def test_selectin_with_nested_joined_collection_still_dedupes( + self, inner_loader_name, expected_sql_count + ): + """Regression: selectinload(...).joinedload/selectinload(...) on a + collection must continue to dedupe inner rows after the + conditional-unique optimization. + """ + User, Address, Dingaling = self.classes("User", "Address", "Dingaling") + sess = fixture_session() + + if inner_loader_name == "joinedload": + inner_opt = selectinload(User.addresses).joinedload( + Address.dingalings + ) + else: + inner_opt = selectinload(User.addresses).selectinload( + Address.dingalings + ) + + def go(): + # expunge so we get fresh queries on each call + sess.expunge_all() + return sess.query(User).options(inner_opt).order_by(User.id).all() + + users = self.assert_sql_count(testing.db, go, expected_sql_count) + + eq_(len(users), 2) + + u1 = users[0] + # Address 1 has 2 dingalings; without .unique() the selectin result + # would produce 2 rows for address 1 and loading.require_unique would + # raise InvalidRequestError. The assertions below are belt-and- + # suspenders for if the conditional logic changes. + address_ids = [a.id for a in u1.addresses] + eq_(len(address_ids), len(set(address_ids))) + eq_(len(u1.addresses), 2) + + # Also verify the grandchildren loaded correctly + target_address = next(a for a in u1.addresses if a.id == 1) + eq_(len(target_address.dingalings), 2) + + u2 = users[1] + address_ids2 = [a.id for a in u2.addresses] + assert len(address_ids2) == len( + set(address_ids2) + ), f"user {u2.id} has duplicate addresses: {address_ids2}" + eq_(len(u2.addresses), 1) + eq_(len(u2.addresses[0].dingalings), 1) + + class TestCompositePlusNonComposite(fixtures.DeclarativeMappedTest): __requires__ = ("tuple_in",)