fix: avoid caching the parsing of large queries

author Daniele Varrazzo <daniele.varrazzo@gmail.com>

Mon, 4 Sep 2023 13:42:00 +0000 (14:42 +0100)

committer Daniele Varrazzo <daniele.varrazzo@gmail.com>

Mon, 11 Sep 2023 01:09:42 +0000 (02:09 +0100)
author Daniele Varrazzo <daniele.varrazzo@gmail.com>
Mon, 4 Sep 2023 13:42:00 +0000 (14:42 +0100)
committer Daniele Varrazzo <daniele.varrazzo@gmail.com>
Mon, 11 Sep 2023 01:09:42 +0000 (02:09 +0100)
diff --git a/docs/news.rst b/docs/news.rst

index 12d57e5e540989e8140c22e600387754412c19b7..f8e4834df98c8ce1715051783ff5e99985a467eb 100644 (file)
--- a/docs/news.rst
+++ b/docs/news.rst
@@ -13,6 +13,8 @@ Future releases
  Psycopg 3.1.11 (unreleased)
  ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  
+- Avoid caching the parsing results of large queries to avoid excessive memory
+  usage (:ticket:`#628`).
  - Fix integer overflow in C/binary extension with OID > 2^31 (:ticket:`#630`).
  - Fix building on Solaris and derivatives (:ticket:`#632`).
  - Fix possible lack of critical section guard in async
diff --git a/psycopg/psycopg/_queries.py b/psycopg/psycopg/_queries.py

index 2a7554c30cc2da6c36c4df7b4af264a631156796..7032f94b3fa4ccdbf794837d6813a696e7b9e6ba 100644 (file)
--- a/psycopg/psycopg/_queries.py
+++ b/psycopg/psycopg/_queries.py
@@ -5,9 +5,10 @@ Utility module to manipulate queries
  # Copyright (C) 2020 The Psycopg Team
  
  import re
-from typing import Any, Dict, List, Mapping, Match, NamedTuple, Optional
+from typing import Any, Callable, Dict, List, Mapping, Match, NamedTuple, Optional
  from typing import Sequence, Tuple, Union, TYPE_CHECKING
  from functools import lru_cache
+from typing_extensions import TypeAlias
  
  from . import pq
  from . import errors as e
@@ -19,6 +20,9 @@ from ._encodings import conn_encoding
  if TYPE_CHECKING:
      from .abc import Transformer
  
+MAX_CACHED_STATEMENT_LENGTH = 4096
+MAX_CACHED_STATEMENT_PARAMS = 50
+
  
  class QueryPart(NamedTuple):
      pre: bytes
@@ -67,12 +71,22 @@ class PostgresQuery:
              bquery = query
  
          if vars is not None:
-            (
-                self.query,
-                self._want_formats,
-                self._order,
-                self._parts,
-            ) = _query2pg(bquery, self._encoding)
+            # Avoid caching queries extremely long or with a huge number of
+            # parameters. They are usually generated by ORMs and have poor
+            # cacheablility (e.g. INSERT ... VALUES (...), (...) with varying
+            # numbers of tuples.
+            # see https://github.com/psycopg/psycopg/discussions/628
+            if (
+                len(bquery) <= MAX_CACHED_STATEMENT_LENGTH
+                and len(vars) <= MAX_CACHED_STATEMENT_PARAMS
+            ):
+                f: _Query2Pg = _query2pg
+            else:
+                f = _query2pg_nocache
+
+            (self.query, self._want_formats, self._order, self._parts) = f(
+                bquery, self._encoding
+            )
          else:
              self.query = bquery
              self._want_formats = self._order = None
@@ -119,9 +133,14 @@ class PostgresClientQuery(PostgresQuery):
              bquery = query
  
          if vars is not None:
-            (self.template, self._order, self._parts) = _query2pg_client(
-                bquery, self._encoding
-            )
+            if (
+                len(bquery) <= MAX_CACHED_STATEMENT_LENGTH
+                and len(vars) <= MAX_CACHED_STATEMENT_PARAMS
+            ):
+                f: _Query2PgClient = _query2pg_client
+            else:
+                f = _query2pg_client_nocache
+            (self.template, self._order, self._parts) = f(bquery, self._encoding)
          else:
              self.query = bquery
              self._order = None
@@ -144,8 +163,12 @@ class PostgresClientQuery(PostgresQuery):
              self.params = None
  
  
-@lru_cache()
-def _query2pg(
+_Query2Pg: TypeAlias = Callable[
+    [bytes, str], Tuple[bytes, List[PyFormat], Optional[List[str]], List[QueryPart]]
+]
+
+
+def _query2pg_nocache(
      query: bytes, encoding: str
  ) -> Tuple[bytes, List[PyFormat], Optional[List[str]], List[QueryPart]]:
      """
@@ -195,8 +218,20 @@ def _query2pg(
      return b"".join(chunks), formats, order, parts
  
  
-@lru_cache()
-def _query2pg_client(
+# Note: the cache size is 128 items, but someone has reported throwing ~12k
+# queries (of type `INSERT ... VALUES (...), (...)` with a varying amount of
+# records), and the resulting cache size is >100Mb. So, we will avoid to cache
+# large queries or queries with a large number of params. See
+# https://github.com/sqlalchemy/sqlalchemy/discussions/10270
+_query2pg = lru_cache()(_query2pg_nocache)
+
+
+_Query2PgClient: TypeAlias = Callable[
+    [bytes, str], Tuple[bytes, Optional[List[str]], List[QueryPart]]
+]
+
+
+def _query2pg_client_nocache(
      query: bytes, encoding: str
  ) -> Tuple[bytes, Optional[List[str]], List[QueryPart]]:
      """
@@ -233,6 +268,9 @@ def _query2pg_client(
      return b"".join(chunks), order, parts
  
  
+_query2pg_client = lru_cache()(_query2pg_client_nocache)
+
+
  def _validate_and_reorder_params(
      parts: List[QueryPart], vars: Params, order: Optional[List[str]]
  ) -> Sequence[Any]:
diff --git a/tests/test_client_cursor.py b/tests/test_client_cursor.py

index 18941a54efa0cd1544c0f07620e423a3d609c18c..512722a82d6649f397f3826f5c539f0e5c8d394f 100644 (file)
--- a/tests/test_client_cursor.py
+++ b/tests/test_client_cursor.py
@@ -149,6 +149,30 @@ def test_statusmessage(conn):
      assert cur.statusmessage is None
  
  
+def test_query_parse_cache_size():
+    # Warning: testing internal structures. Test might need refactoring with the code.
+    cache = psycopg._queries._query2pg_client
+    cache.cache_clear()
+    ci = cache.cache_info()
+    h0, m0 = ci.hits, ci.misses
+    tests = [
+        (f"select 1 -- {'x' * 3500}", (), h0, m0 + 1),
+        (f"select 1 -- {'x' * 3500}", (), h0 + 1, m0 + 1),
+        (f"select 1 -- {'x' * 4500}", (), h0 + 1, m0 + 1),
+        (f"select 1 -- {'x' * 4500}", (), h0 + 1, m0 + 1),
+        (f"select 1 -- {'%s' * 40}", ("x",) * 40, h0 + 1, m0 + 2),
+        (f"select 1 -- {'%s' * 40}", ("x",) * 40, h0 + 2, m0 + 2),
+        (f"select 1 -- {'%s' * 60}", ("x",) * 60, h0 + 2, m0 + 2),
+        (f"select 1 -- {'%s' * 60}", ("x",) * 60, h0 + 2, m0 + 2),
+    ]
+    for i, (query, params, hits, misses) in enumerate(tests):
+        pq = psycopg._queries.PostgresClientQuery(psycopg.adapt.Transformer())
+        pq.convert(query, params)
+        ci = cache.cache_info()
+        assert ci.hits == hits, f"at {i}"
+        assert ci.misses == misses, f"at {i}"
+
+
  def test_execute_sql(conn):
      cur = conn.cursor()
      cur.execute(sql.SQL("select {value}").format(value="hello"))
diff --git a/tests/test_cursor.py b/tests/test_cursor.py

index a39ed6755e614a3ed9788b4505af07875a5817cb..ae1ec1c74692f6a348fb8d00f75a70e5420be11d 100644 (file)
--- a/tests/test_cursor.py
+++ b/tests/test_cursor.py
@@ -132,6 +132,30 @@ def test_statusmessage(conn):
      assert cur.statusmessage is None
  
  
+def test_query_parse_cache_size():
+    # Warning: testing internal structures. Test might need refactoring with the code.
+    cache = psycopg._queries._query2pg
+    cache.cache_clear()
+    ci = cache.cache_info()
+    h0, m0 = ci.hits, ci.misses
+    tests = [
+        (f"select 1 -- {'x' * 3500}", (), h0, m0 + 1),
+        (f"select 1 -- {'x' * 3500}", (), h0 + 1, m0 + 1),
+        (f"select 1 -- {'x' * 4500}", (), h0 + 1, m0 + 1),
+        (f"select 1 -- {'x' * 4500}", (), h0 + 1, m0 + 1),
+        (f"select 1 -- {'%s' * 40}", ("x",) * 40, h0 + 1, m0 + 2),
+        (f"select 1 -- {'%s' * 40}", ("x",) * 40, h0 + 2, m0 + 2),
+        (f"select 1 -- {'%s' * 60}", ("x",) * 60, h0 + 2, m0 + 2),
+        (f"select 1 -- {'%s' * 60}", ("x",) * 60, h0 + 2, m0 + 2),
+    ]
+    for i, (query, params, hits, misses) in enumerate(tests):
+        pq = psycopg._queries.PostgresQuery(psycopg.adapt.Transformer())
+        pq.convert(query, params)
+        ci = cache.cache_info()
+        assert ci.hits == hits, f"at {i}"
+        assert ci.misses == misses, f"at {i}"
+
+
  def test_execute_many_results(conn):
      cur = conn.cursor()
      assert cur.nextset() is None
author	Daniele Varrazzo <daniele.varrazzo@gmail.com>
	Mon, 4 Sep 2023 13:42:00 +0000 (14:42 +0100)
committer	Daniele Varrazzo <daniele.varrazzo@gmail.com>
	Mon, 11 Sep 2023 01:09:42 +0000 (02:09 +0100)
docs/news.rst		patch \| blob \| blame \| history
psycopg/psycopg/_queries.py		patch \| blob \| blame \| history
tests/test_client_cursor.py		patch \| blob \| blame \| history
tests/test_cursor.py		patch \| blob \| blame \| history