From: Daniele Varrazzo <daniele.varrazzo@gmail.com>
Date: Wed, 14 Sep 2022 11:33:00 +0000 (+0100)
Subject: perf: add C implementation of array load
X-Git-Tag: 3.1.5~12^2~12
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=7c3ccb49f4386f6f2543016464e205d99510c680;p=thirdparty%2Fpsycopg.git

perf: add C implementation of array load

This implementation can still be improved: we always call the Python
loader to decode the single elements; we could try to call the C loader
if available.
---

diff --git a/psycopg/psycopg/types/array.py b/psycopg/psycopg/types/array.py
index cefb8fa2f..4fd234660 100644
--- a/psycopg/psycopg/types/array.py
+++ b/psycopg/psycopg/types/array.py
@@ -387,7 +387,7 @@ def _load_text(
             data = bytes(data)
         idx = data.find(b"=")
         if idx == -1:
-            raise e.DataError("malformed array, no '=' after dimension information")
+            raise e.DataError("malformed array: no '=' after dimension information")
         data = data[idx + 1 :]
 
     re_parse = _get_array_parse_regexp(delimiter)
@@ -403,13 +403,13 @@ def _load_text(
 
         elif t == b"}":
             if not stack:
-                raise e.DataError("malformed array, unexpected '}'")
+                raise e.DataError("malformed array: unexpected '}'")
             rv = stack.pop()
 
         else:
             if not stack:
                 wat = t[:10].decode("utf8", "replace") + "..." if len(t) > 10 else ""
-                raise e.DataError(f"malformed array, unexpected '{wat}'")
+                raise e.DataError(f"malformed array: unexpected '{wat}'")
             if t == b"NULL":
                 v = None
             else:
diff --git a/psycopg_c/psycopg_c/types/array.pyx b/psycopg_c/psycopg_c/types/array.pyx
index 7c4b47b81..9495eeb75 100644
--- a/psycopg_c/psycopg_c/types/array.pyx
+++ b/psycopg_c/psycopg_c/types/array.pyx
@@ -4,11 +4,175 @@ C optimized functions to manipulate arrays
 
 # Copyright (C) 2022 The Psycopg Team
 
+from libc.stdint cimport int32_t
+from libc.string cimport strchr
+from cpython.mem cimport PyMem_Malloc, PyMem_Free
+from cpython.ref cimport Py_INCREF
+from cpython.list cimport PyList_New, PyList_SET_ITEM
+
+from psycopg_c.pq cimport _buffer_as_string_and_size
+from psycopg_c._psycopg cimport endian
+
+from psycopg import errors as e
+
+
 def array_load_text(
     data: Buffer, load: LoadFunc, delimiter: bytes = b","
 ) -> List[Any]:
-    raise NotImplementedError
+    cdef char cdelim = delimiter[0]  # only the first byte of the delimiter is used
+
+    cdef char *buf
+    cdef Py_ssize_t length
+    _buffer_as_string_and_size(data, &buf, &length)  # presumably exposes data's bytes as buf/length - confirm
+    if length == 0:
+        raise e.DataError("malformed array: empty data")
+
+    cdef char *end = buf + length  # one past the last byte to parse
+
+    # Remove the dimensions information prefix (``[...]=``)
+    if buf[0] == b"[":
+        buf = strchr(buf + 1, b'=')  # NOTE(review): strchr stops at a NUL byte, not at `end` - confirm buf is NUL-terminated
+        if buf == NULL:
+            raise e.DataError("malformed array: no '=' after dimension information")
+        buf += 1  # step over the '=' itself
+
+    rv = None  # set on the first '{', then to each list popped on '}'
+    stack: List[Any] = []  # lists currently open, innermost last
+
+    while buf < end:
+        if buf[0] == b'{':
+            a = []
+            if rv is None:
+                rv = a
+            if stack:
+                stack[-1].append(a)  # nest the new list into its parent
+            stack.append(a)
+            buf += 1
+
+        elif buf[0] == b'}':
+            if not stack:
+                raise e.DataError("malformed array: unexpected '}'")
+            rv = stack.pop()  # the last list popped is the one returned
+            buf += 1
+
+        elif buf[0] == cdelim:
+            buf += 1  # delimiters carry no data: skip them
+
+        else:
+            v = parse_token(&buf, end, cdelim, load)  # also advances buf past the token
+            if not stack:
+                raise e.DataError("malformed array: missing initial '{'")
+            stack[-1].append(v)
+
+    assert rv is not None  # NOTE(review): input with no '{' (e.g. only delimiters) fails here with AssertionError, not DataError
+    return rv
+
+
+cdef object parse_token(char **bufptr, char *bufend, char cdelim, object load):
+    cdef char *start = bufptr[0]
+    cdef int has_quotes = start[0] == b'"'  # the token opens with a double quote
+    cdef int quoted = has_quotes  # true until the closing quote is found
+    cdef int num_escapes = 0  # number of backslashes acting as escapes
+    cdef int escaped = 0  # the previous byte was an escaping backslash
+
+    if has_quotes:
+        start += 1  # the opening quote is not part of the value
+    cdef char *end = start
+
+    while end < bufend:
+        if (end[0] == cdelim or end[0] == b'}') and not quoted:
+            break  # an unquoted delimiter or '}' terminates the token
+        elif end[0] == b'\\' and not escaped:
+            num_escapes += 1
+            escaped = 1
+            end += 1
+            continue  # keep `escaped` set for the next byte
+        elif end[0] == b'"' and not escaped:
+            quoted = 0
+        escaped = 0
+        end += 1
+    else:  # loop ended without `break`: no terminator before bufend
+        raise e.DataError("malformed array: hit the end of the buffer")
+
+    # Return the new position for the buffer
+    bufptr[0] = end
+    if has_quotes:
+        end -= 1  # drop the closing quote; NOTE(review): assumes it is the byte right before the terminator
+
+    cdef int length = (end - start)
+    if length == 4 and not has_quotes \
+            and start[0] == b'N' and start[1] == b'U' \
+            and start[2] == b'L' and start[3] == b'L':
+        return None  # an unquoted, exact-case NULL is not passed to load()
+
+    cdef char *unesc
+    cdef char *src
+    cdef char *tgt
+
+    if not num_escapes:
+        # TODO: fast path for C loaders and no copy
+        b = start[:length]
+        return load(b)
+
+    else:
+        unesc = <char *>PyMem_Malloc(length - num_escapes)  # NOTE(review): the result is not checked for NULL
+        src = start
+        tgt = unesc
+        while src < end:
+            if src[0] == b'\\':
+                src += 1  # skip the backslash, the next byte is copied as is
+            tgt[0] = src[0]
+            src += 1
+            tgt += 1
+
+        try:
+            b = unesc[:length - num_escapes]
+            return load(b)
+        finally:
+            PyMem_Free(unesc)  # released even if load() raises
 
 
 def array_load_binary(data: Buffer, load: LoadFunc) -> List[Any]:
-    raise NotImplementedError
+    cdef char *buf
+    cdef Py_ssize_t length
+    _buffer_as_string_and_size(data, &buf, &length)  # NOTE(review): length is never checked against the offsets read below
+
+    # head is ndims, hasnull, elem oid (one int32 each; only ndims is read here)
+    cdef int32_t *buf32 = <int32_t *>buf
+    cdef int ndims = endian.be32toh(buf32[0])
+
+    if not ndims:
+        return []
+
+    cdef long nelems = 1  # becomes the product of all the dimension sizes
+    cdef int dim
+    cdef long i
+    dims = []
+    for i in range(3, 3 + 2 * ndims, 2):
+        # Every dimension is dim, lower bound (the lower bound is skipped)
+        dim = endian.be32toh(buf32[i])
+        nelems *= dim
+        dims.append(dim)
+
+    buf += (3 + 2 * ndims) * sizeof(int32_t)  # move past the head and the dimensions
+    cdef list out = PyList_New(nelems)  # slots are filled by PyList_SET_ITEM below
+
+    cdef Py_ssize_t size
+    for i in range(nelems):
+        size = <int32_t>endian.be32toh((<int32_t *>buf)[0])  # every item is prefixed by its int32 size
+        buf += sizeof(int32_t)
+        if size == -1:  # a size of -1 stores None; no payload bytes are consumed
+            Py_INCREF(None)
+            PyList_SET_ITEM(out, i, None)
+        else:
+            # TODO: do without copy for C loaders
+            val = load(buf[:size])
+            Py_INCREF(val)  # PyList_SET_ITEM steals a reference
+            PyList_SET_ITEM(out, i, val)
+            buf += size
+
+    # for ndims > 1 we have to aggregate the array into sub-arrays
+    for dim in dims[-1:0:-1]:  # from the last dimension down to the second one
+        out = [out[i : i + dim] for i in range(0, len(out), dim)]
+
+    return out
diff --git a/tests/types/test_array.py b/tests/types/test_array.py
index 11996ad54..74c17a614 100644
--- a/tests/types/test_array.py
+++ b/tests/types/test_array.py
@@ -15,6 +15,7 @@ from psycopg.postgres import types as builtins
 tests_str = [
     ([[[[[["a"]]]]]], "{{{{{{a}}}}}}"),
     ([[[[[[None]]]]]], "{{{{{{NULL}}}}}}"),
+    ([[[[[["NULL"]]]]]], '{{{{{{"NULL"}}}}}}'),
     (["foo", "bar", "baz"], "{foo,bar,baz}"),
     (["foo", None, "baz"], "{foo,null,baz}"),
     (["foo", "null", "", "baz"], '{foo,"null","",baz}'),