From: Daniele Varrazzo
Date: Mon, 4 Sep 2023 13:42:00 +0000 (+0100)
Subject: fix: avoid caching the parsing of large queries
X-Git-Tag: pool-3.1.8~14
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=29be5a47342000811eb44baa32ef90fc1af1cd2a;p=thirdparty%2Fpsycopg.git

fix: avoid caching the parsing of large queries

These queries are typically generated by ORMs: they have poor cacheability,
but caching their parsed form can consume a large amount of memory.

Close #628. See also
https://github.com/sqlalchemy/sqlalchemy/discussions/10270.
---

diff --git a/docs/news.rst b/docs/news.rst
index 12d57e5e5..f8e4834df 100644
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -13,6 +13,8 @@ Future releases
 Psycopg 3.1.11 (unreleased)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+- Avoid caching the parsing results of large queries, to prevent excessive
+  memory usage (:ticket:`#628`).
 - Fix integer overflow in C/binary extension with OID > 2^31 (:ticket:`#630`).
 - Fix building on Solaris and derivatives (:ticket:`#632`).
 - Fix possible lack of critical section guard in async
diff --git a/psycopg/psycopg/_queries.py b/psycopg/psycopg/_queries.py
index 2a7554c30..7032f94b3 100644
--- a/psycopg/psycopg/_queries.py
+++ b/psycopg/psycopg/_queries.py
@@ -5,9 +5,10 @@ Utility module to manipulate queries
 # Copyright (C) 2020 The Psycopg Team
 
 import re
-from typing import Any, Dict, List, Mapping, Match, NamedTuple, Optional
+from typing import Any, Callable, Dict, List, Mapping, Match, NamedTuple, Optional
 from typing import Sequence, Tuple, Union, TYPE_CHECKING
 from functools import lru_cache
+from typing_extensions import TypeAlias
 
 from . import pq
 from . import errors as e
@@ -19,6 +20,9 @@ from ._encodings import conn_encoding
 if TYPE_CHECKING:
     from .abc import Transformer
 
+MAX_CACHED_STATEMENT_LENGTH = 4096
+MAX_CACHED_STATEMENT_PARAMS = 50
+
 
 class QueryPart(NamedTuple):
     pre: bytes
@@ -67,12 +71,22 @@ class PostgresQuery:
             bquery = query
 
         if vars is not None:
-            (
-                self.query,
-                self._want_formats,
-                self._order,
-                self._parts,
-            ) = _query2pg(bquery, self._encoding)
+            # Avoid caching extremely long queries or queries with a huge
+            # number of parameters. They are usually generated by ORMs and
+            # have poor cacheability (e.g. INSERT ... VALUES (...), (...)
+            # with varying numbers of tuples).
+            # See https://github.com/psycopg/psycopg/discussions/628
+            if (
+                len(bquery) <= MAX_CACHED_STATEMENT_LENGTH
+                and len(vars) <= MAX_CACHED_STATEMENT_PARAMS
+            ):
+                f: _Query2Pg = _query2pg
+            else:
+                f = _query2pg_nocache
+
+            (self.query, self._want_formats, self._order, self._parts) = f(
+                bquery, self._encoding
+            )
         else:
             self.query = bquery
             self._want_formats = self._order = None
@@ -119,9 +133,14 @@ class PostgresClientQuery(PostgresQuery):
             bquery = query
 
         if vars is not None:
-            (self.template, self._order, self._parts) = _query2pg_client(
-                bquery, self._encoding
-            )
+            if (
+                len(bquery) <= MAX_CACHED_STATEMENT_LENGTH
+                and len(vars) <= MAX_CACHED_STATEMENT_PARAMS
+            ):
+                f: _Query2PgClient = _query2pg_client
+            else:
+                f = _query2pg_client_nocache
+            (self.template, self._order, self._parts) = f(bquery, self._encoding)
         else:
             self.query = bquery
             self._order = None
@@ -144,8 +163,12 @@ class PostgresClientQuery(PostgresQuery):
         self.params = None
 
 
-@lru_cache()
-def _query2pg(
+_Query2Pg: TypeAlias = Callable[
+    [bytes, str], Tuple[bytes, List[PyFormat], Optional[List[str]], List[QueryPart]]
+]
+
+
+def _query2pg_nocache(
     query: bytes, encoding: str
 ) -> Tuple[bytes, List[PyFormat], Optional[List[str]], List[QueryPart]]:
     """
@@ -195,8 +218,20 @@ def _query2pg_nocache(
     return b"".join(chunks), formats, order, parts
 
 
-@lru_cache()
-def _query2pg_client(
+# Note: the default lru_cache size is 128 entries, but someone reported
+# issuing ~12k queries (of type `INSERT ... VALUES (...), (...)` with a
+# varying number of records), making the cache grow to more than 100MB.
+# So we avoid caching large queries or queries with a large number of
+# params. See https://github.com/sqlalchemy/sqlalchemy/discussions/10270
+_query2pg = lru_cache()(_query2pg_nocache)
+
+
+_Query2PgClient: TypeAlias = Callable[
+    [bytes, str], Tuple[bytes, Optional[List[str]], List[QueryPart]]
+]
+
+
+def _query2pg_client_nocache(
     query: bytes, encoding: str
 ) -> Tuple[bytes, Optional[List[str]], List[QueryPart]]:
     """
@@ -233,6 +268,9 @@ def _query2pg_client_nocache(
     return b"".join(chunks), order, parts
 
 
+_query2pg_client = lru_cache()(_query2pg_client_nocache)
+
+
 def _validate_and_reorder_params(
     parts: List[QueryPart], vars: Params, order: Optional[List[str]]
 ) -> Sequence[Any]:
diff --git a/tests/test_client_cursor.py b/tests/test_client_cursor.py
index 18941a54e..512722a82 100644
--- a/tests/test_client_cursor.py
+++ b/tests/test_client_cursor.py
@@ -149,6 +149,30 @@ def test_statusmessage(conn):
     assert cur.statusmessage is None
 
 
+def test_query_parse_cache_size():
+    # Warning: testing internal structures. The test may need to change together with the code.
+    cache = psycopg._queries._query2pg_client
+    cache.cache_clear()
+    ci = cache.cache_info()
+    h0, m0 = ci.hits, ci.misses
+    tests = [
+        (f"select 1 -- {'x' * 3500}", (), h0, m0 + 1),
+        (f"select 1 -- {'x' * 3500}", (), h0 + 1, m0 + 1),
+        (f"select 1 -- {'x' * 4500}", (), h0 + 1, m0 + 1),
+        (f"select 1 -- {'x' * 4500}", (), h0 + 1, m0 + 1),
+        (f"select 1 -- {'%s' * 40}", ("x",) * 40, h0 + 1, m0 + 2),
+        (f"select 1 -- {'%s' * 40}", ("x",) * 40, h0 + 2, m0 + 2),
+        (f"select 1 -- {'%s' * 60}", ("x",) * 60, h0 + 2, m0 + 2),
+        (f"select 1 -- {'%s' * 60}", ("x",) * 60, h0 + 2, m0 + 2),
+    ]
+    for i, (query, params, hits, misses) in enumerate(tests):
+        pq = psycopg._queries.PostgresClientQuery(psycopg.adapt.Transformer())
+        pq.convert(query, params)
+        ci = cache.cache_info()
+        assert ci.hits == hits, f"at {i}"
+        assert ci.misses == misses, f"at {i}"
+
+
 def test_execute_sql(conn):
     cur = conn.cursor()
     cur.execute(sql.SQL("select {value}").format(value="hello"))
diff --git a/tests/test_cursor.py b/tests/test_cursor.py
index a39ed6755..ae1ec1c74 100644
--- a/tests/test_cursor.py
+++ b/tests/test_cursor.py
@@ -132,6 +132,30 @@ def test_statusmessage(conn):
     assert cur.statusmessage is None
 
 
+def test_query_parse_cache_size():
+    # Warning: testing internal structures. The test may need to change together with the code.
+    cache = psycopg._queries._query2pg
+    cache.cache_clear()
+    ci = cache.cache_info()
+    h0, m0 = ci.hits, ci.misses
+    tests = [
+        (f"select 1 -- {'x' * 3500}", (), h0, m0 + 1),
+        (f"select 1 -- {'x' * 3500}", (), h0 + 1, m0 + 1),
+        (f"select 1 -- {'x' * 4500}", (), h0 + 1, m0 + 1),
+        (f"select 1 -- {'x' * 4500}", (), h0 + 1, m0 + 1),
+        (f"select 1 -- {'%s' * 40}", ("x",) * 40, h0 + 1, m0 + 2),
+        (f"select 1 -- {'%s' * 40}", ("x",) * 40, h0 + 2, m0 + 2),
+        (f"select 1 -- {'%s' * 60}", ("x",) * 60, h0 + 2, m0 + 2),
+        (f"select 1 -- {'%s' * 60}", ("x",) * 60, h0 + 2, m0 + 2),
+    ]
+    for i, (query, params, hits, misses) in enumerate(tests):
+        pq = psycopg._queries.PostgresQuery(psycopg.adapt.Transformer())
+        pq.convert(query, params)
+        ci = cache.cache_info()
+        assert ci.hits == hits, f"at {i}"
+        assert ci.misses == misses, f"at {i}"
+
+
 def test_execute_many_results(conn):
     cur = conn.cursor()
     assert cur.nextset() is None
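
---

For reference, a minimal standalone sketch of the size-gated caching pattern
this patch applies: keep the uncached implementation as the source of truth,
derive the cached variant from it with lru_cache, and dispatch on input size.
The thresholds mirror the constants above, but parse_nocache/parse_cached/parse
are illustrative stand-ins, not the actual psycopg functions:

    from functools import lru_cache
    from typing import Callable, Tuple

    MAX_CACHED_STATEMENT_LENGTH = 4096
    MAX_CACHED_STATEMENT_PARAMS = 50


    def parse_nocache(query: bytes, encoding: str) -> Tuple[bytes, str]:
        # Stand-in for the expensive parsing step (placeholder rewriting etc.).
        return query.replace(b"%s", b"$1"), encoding


    # The cached variant simply wraps the uncached one; lru_cache() keeps
    # at most 128 entries by default.
    parse_cached = lru_cache()(parse_nocache)


    def parse(query: bytes, nparams: int, encoding: str) -> Tuple[bytes, str]:
        # Huge ORM-generated statements bypass the LRU, so they cannot pin
        # megabytes of parsed queries in memory.
        if (
            len(query) <= MAX_CACHED_STATEMENT_LENGTH
            and nparams <= MAX_CACHED_STATEMENT_PARAMS
        ):
            f: Callable[[bytes, str], Tuple[bytes, str]] = parse_cached
        else:
            f = parse_nocache
        return f(query, encoding)

Because lru_cache() wraps a plain function, cache_info() and cache_clear()
remain available on the cached variant; the tests above rely on exactly that
to assert the expected hit/miss counts around the size thresholds.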