Separate out bytea sort support from varlena.c

author John Naylor <john.naylor@postgresql.org>

Tue, 16 Dec 2025 08:19:16 +0000 (15:19 +0700)

committer John Naylor <john.naylor@postgresql.org>

Tue, 16 Dec 2025 08:19:16 +0000 (15:19 +0700)
author John Naylor <john.naylor@postgresql.org>
Tue, 16 Dec 2025 08:19:16 +0000 (15:19 +0700)
committer John Naylor <john.naylor@postgresql.org>
Tue, 16 Dec 2025 08:19:16 +0000 (15:19 +0700)
diff --git a/src/backend/utils/adt/bytea.c b/src/backend/utils/adt/bytea.c

index 6e7b914c56395fa4d941d3e8c4b43c9534703904..f8524548e4671439e24a5b6f7c63e42c13265f66 100644 (file)
--- a/src/backend/utils/adt/bytea.c
+++ b/src/backend/utils/adt/bytea.c
@@ -15,18 +15,19 @@
  #include "postgres.h"
  
  #include "access/detoast.h"
-#include "catalog/pg_collation_d.h"
-#include "catalog/pg_type_d.h"
+#include "common/hashfn.h"
  #include "common/int.h"
  #include "fmgr.h"
+#include "lib/hyperloglog.h"
  #include "libpq/pqformat.h"
  #include "port/pg_bitutils.h"
+#include "port/pg_bswap.h"
  #include "utils/builtins.h"
  #include "utils/bytea.h"
  #include "utils/fmgrprotos.h"
+#include "utils/guc.h"
  #include "utils/memutils.h"
  #include "utils/sortsupport.h"
-#include "utils/varlena.h"
  #include "varatt.h"
  
  /* GUC variable */
@@ -37,6 +38,19 @@ static bytea *bytea_substring(Datum str, int S, int L,
                                                           bool length_not_specified);
  static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
  
+typedef struct
+{
+       bool            abbreviate;             /* Should we abbreviate keys? */
+       hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
+       hyperLogLogState full_card; /* Full key cardinality state */
+       double          prop_card;              /* Required cardinality proportion */
+} ByteaSortSupport;
+
+/* Static function declarations for sort support */
+static int     byteafastcmp(Datum x, Datum y, SortSupport ssup);
+static Datum bytea_abbrev_convert(Datum original, SortSupport ssup);
+static bool bytea_abbrev_abort(int memtupcount, SortSupport ssup);
+
  /*
   * bytea_catenate
   *     Guts of byteacat(), broken out so it can be used by other functions
@@ -1001,6 +1015,201 @@ bytea_smaller(PG_FUNCTION_ARGS)
         PG_RETURN_BYTEA_P(result);
  }
  
+/*
+ * sortsupport comparison func
+ */
+static int
+byteafastcmp(Datum x, Datum y, SortSupport ssup)
+{
+       bytea      *arg1 = DatumGetByteaPP(x);
+       bytea      *arg2 = DatumGetByteaPP(y);
+       char       *a1p,
+                          *a2p;
+       int                     len1,
+                               len2,
+                               result;
+
+       a1p = VARDATA_ANY(arg1);
+       a2p = VARDATA_ANY(arg2);
+
+       len1 = VARSIZE_ANY_EXHDR(arg1);
+       len2 = VARSIZE_ANY_EXHDR(arg2);
+
+       result = memcmp(a1p, a2p, Min(len1, len2));
+       if ((result == 0) && (len1 != len2))
+               result = (len1 < len2) ? -1 : 1;
+
+       /* We can't afford to leak memory here. */
+       if (PointerGetDatum(arg1) != x)
+               pfree(arg1);
+       if (PointerGetDatum(arg2) != y)
+               pfree(arg2);
+
+       return result;
+}
+
+/*
+ * Conversion routine for sortsupport.  Converts original to abbreviated key
+ * representation.  Our encoding strategy is simple -- pack the first 8 bytes
+ * of the bytea data into a Datum (on little-endian machines, the bytes are
+ * stored in reverse order), and treat it as an unsigned integer.
+ */
+static Datum
+bytea_abbrev_convert(Datum original, SortSupport ssup)
+{
+       const size_t max_prefix_bytes = sizeof(Datum);
+       ByteaSortSupport *bss = (ByteaSortSupport *) ssup->ssup_extra;
+       bytea      *authoritative = DatumGetByteaPP(original);
+       char       *authoritative_data = VARDATA_ANY(authoritative);
+       Datum           res;
+       char       *pres;
+       int                     len;
+       uint32          hash;
+
+       pres = (char *) &res;
+
+       /* memset(), so any non-overwritten bytes are NUL */
+       memset(pres, 0, max_prefix_bytes);
+       len = VARSIZE_ANY_EXHDR(authoritative);
+
+       /*
+        * Short byteas will have terminating NUL bytes in the abbreviated datum.
+        * Abbreviated comparison need not make a distinction between these NUL
+        * bytes, and NUL bytes representing actual NULs in the authoritative
+        * representation.
+        *
+        * Hopefully a comparison at or past one abbreviated key's terminating NUL
+        * byte will resolve the comparison without consulting the authoritative
+        * representation; specifically, some later non-NUL byte in the longer
+        * bytea can resolve the comparison against a subsequent terminating NUL
+        * in the shorter bytea.  There will usually be what is effectively a
+        * "length-wise" resolution there and then.
+        *
+        * If that doesn't work out -- if all bytes in the longer bytea positioned
+        * at or past the offset of the smaller bytea (first) terminating NUL are
+        * actually representative of NUL bytes in the authoritative binary bytea
+        * (perhaps with some *terminating* NUL bytes towards the end of the
+        * longer bytea iff it happens to still be small) -- then an authoritative
+        * tie-breaker will happen, and do the right thing: explicitly consider
+        * bytea length.
+        */
+       memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
+
+       /*
+        * Maintain approximate cardinality of both abbreviated keys and original,
+        * authoritative keys using HyperLogLog.  Used as cheap insurance against
+        * the worst case, where we do many string abbreviations for no saving in
+        * full memcmp()-based comparisons.  These statistics are used by
+        * bytea_abbrev_abort().
+        *
+        * First, Hash key proper, or a significant fraction of it.  Mix in length
+        * in order to compensate for cases where differences are past
+        * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
+        */
+       hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
+                                                                  Min(len, PG_CACHE_LINE_SIZE)));
+
+       if (len > PG_CACHE_LINE_SIZE)
+               hash ^= DatumGetUInt32(hash_uint32((uint32) len));
+
+       addHyperLogLog(&bss->full_card, hash);
+
+       /* Hash abbreviated key */
+       {
+               uint32          tmp;
+
+               tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32);
+               hash = DatumGetUInt32(hash_uint32(tmp));
+       }
+
+       addHyperLogLog(&bss->abbr_card, hash);
+
+       /*
+        * Byteswap on little-endian machines.
+        *
+        * This is needed so that ssup_datum_unsigned_cmp() works correctly on all
+        * platforms.
+        */
+       res = DatumBigEndianToNative(res);
+
+       /* Don't leak memory here */
+       if (PointerGetDatum(authoritative) != original)
+               pfree(authoritative);
+
+       return res;
+}
+
+/*
+ * Callback for estimating effectiveness of abbreviated key optimization, using
+ * heuristic rules.  Returns value indicating if the abbreviation optimization
+ * should be aborted, based on its projected effectiveness.
+ *
+ * This is based on varstr_abbrev_abort(), but some comments have been elided
+ * for brevity. See there for more details.
+ */
+static bool
+bytea_abbrev_abort(int memtupcount, SortSupport ssup)
+{
+       ByteaSortSupport *bss = (ByteaSortSupport *) ssup->ssup_extra;
+       double          abbrev_distinct,
+                               key_distinct;
+
+       Assert(ssup->abbreviate);
+
+       /* Have a little patience */
+       if (memtupcount < 100)
+               return false;
+
+       abbrev_distinct = estimateHyperLogLog(&bss->abbr_card);
+       key_distinct = estimateHyperLogLog(&bss->full_card);
+
+       /*
+        * Clamp cardinality estimates to at least one distinct value.  While
+        * NULLs are generally disregarded, if only NULL values were seen so far,
+        * that might misrepresent costs if we failed to clamp.
+        */
+       if (abbrev_distinct < 1.0)
+               abbrev_distinct = 1.0;
+
+       if (key_distinct < 1.0)
+               key_distinct = 1.0;
+
+       if (trace_sort)
+       {
+               double          norm_abbrev_card = abbrev_distinct / (double) memtupcount;
+
+               elog(LOG, "bytea_abbrev: abbrev_distinct after %d: %f "
+                        "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
+                        memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
+                        bss->prop_card);
+       }
+
+       /*
+        * If the number of distinct abbreviated keys approximately matches the
+        * number of distinct original keys, continue with abbreviation.
+        */
+       if (abbrev_distinct > key_distinct * bss->prop_card)
+       {
+               /*
+                * Decay required cardinality aggressively after 10,000 tuples.
+                */
+               if (memtupcount > 10000)
+                       bss->prop_card *= 0.65;
+
+               return false;
+       }
+
+       /*
+        * Abort abbreviation strategy.
+        */
+       if (trace_sort)
+               elog(LOG, "bytea_abbrev: aborted abbreviation at %d "
+                        "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
+                        memtupcount, abbrev_distinct, key_distinct, bss->prop_card);
+
+       return true;
+}
+
  Datum
  bytea_sortsupport(PG_FUNCTION_ARGS)
  {
@@ -1009,8 +1218,27 @@ bytea_sortsupport(PG_FUNCTION_ARGS)
  
         oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
  
-       /* Use generic string SortSupport, forcing "C" collation */
-       varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);
+       ssup->comparator = byteafastcmp;
+
+       /*
+        * Set up abbreviation support if requested.
+        */
+       if (ssup->abbreviate)
+       {
+               ByteaSortSupport *bss;
+
+               bss = palloc_object(ByteaSortSupport);
+               bss->abbreviate = true;
+               bss->prop_card = 0.20;
+               initHyperLogLog(&bss->abbr_card, 10);
+               initHyperLogLog(&bss->full_card, 10);
+
+               ssup->ssup_extra = bss;
+               ssup->abbrev_full_comparator = ssup->comparator;
+               ssup->comparator = ssup_datum_unsigned_cmp;
+               ssup->abbrev_converter = bytea_abbrev_convert;
+               ssup->abbrev_abort = bytea_abbrev_abort;
+       }
  
         MemoryContextSwitchTo(oldcontext);
  
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c

index baa5b44ea8dd033c1e7a904084ea1a42fc3fe1d7..8adeb8dadc66e98465001ab87e277359df36bd91 100644 (file)
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -92,7 +92,7 @@ typedef struct
         int                     last_returned;  /* Last comparison result (cache) */
         bool            cache_blob;             /* Does buf2 contain strxfrm() blob, etc? */
         bool            collate_c;
-       Oid                     typid;                  /* Actual datatype (text/bpchar/bytea/name) */
+       Oid                     typid;                  /* Actual datatype (text/bpchar/name) */
         hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
         hyperLogLogState full_card; /* Full key cardinality state */
         double          prop_card;              /* Required cardinality proportion */
@@ -1617,10 +1617,8 @@ bttextsortsupport(PG_FUNCTION_ARGS)
   * Includes locale support, and support for BpChar semantics (i.e. removing
   * trailing spaces before comparison).
   *
- * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
- * same representation.  Callers that always use the C collation (e.g.
- * non-collatable type callers like bytea) may have NUL bytes in their strings;
- * this will not work with any other collation, though.
+ * Relies on the assumption that text, VarChar, and BpChar all have the
+ * same representation.
   */
  void
  varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)
@@ -1983,7 +1981,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)
   * representation.  Our encoding strategy is simple -- pack the first 8 bytes
   * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
   * stored in reverse order), and treat it as an unsigned integer.  When the "C"
- * locale is used, or in case of bytea, just memcpy() from original instead.
+ * locale is used just memcpy() from original instead.
   */
  static Datum
  varstr_abbrev_convert(Datum original, SortSupport ssup)
@@ -2010,30 +2008,8 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
  
         /*
          * If we're using the C collation, use memcpy(), rather than strxfrm(), to
-        * abbreviate keys.  The full comparator for the C locale is always
-        * memcmp().  It would be incorrect to allow bytea callers (callers that
-        * always force the C collation -- bytea isn't a collatable type, but this
-        * approach is convenient) to use strxfrm().  This is because bytea
-        * strings may contain NUL bytes.  Besides, this should be faster, too.
-        *
-        * More generally, it's okay that bytea callers can have NUL bytes in
-        * strings because abbreviated cmp need not make a distinction between
-        * terminating NUL bytes, and NUL bytes representing actual NULs in the
-        * authoritative representation.  Hopefully a comparison at or past one
-        * abbreviated key's terminating NUL byte will resolve the comparison
-        * without consulting the authoritative representation; specifically, some
-        * later non-NUL byte in the longer string can resolve the comparison
-        * against a subsequent terminating NUL in the shorter string.  There will
-        * usually be what is effectively a "length-wise" resolution there and
-        * then.
-        *
-        * If that doesn't work out -- if all bytes in the longer string
-        * positioned at or past the offset of the smaller string's (first)
-        * terminating NUL are actually representative of NUL bytes in the
-        * authoritative binary string (perhaps with some *terminating* NUL bytes
-        * towards the end of the longer string iff it happens to still be small)
-        * -- then an authoritative tie-breaker will happen, and do the right
-        * thing: explicitly consider string length.
+        * abbreviate keys.  The full comparator for the C locale is also
+        * memcmp().  This should be faster than strxfrm().
          */
         if (sss->collate_c)
                 memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));
@@ -2115,9 +2091,6 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)
                  * strxfrm() blob is itself NUL terminated, leaving no danger of
                  * misinterpreting any NUL bytes not intended to be interpreted as
                  * logically representing termination.
-                *
-                * (Actually, even if there were NUL bytes in the blob it would be
-                * okay.  See remarks on bytea case above.)
                  */
                 memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));
         }
@@ -2198,10 +2171,10 @@ varstr_abbrev_abort(int memtupcount, SortSupport ssup)
          * NULLs are generally disregarded, if only NULL values were seen so far,
          * that might misrepresent costs if we failed to clamp.
          */
-       if (abbrev_distinct <= 1.0)
+       if (abbrev_distinct < 1.0)
                 abbrev_distinct = 1.0;
  
-       if (key_distinct <= 1.0)
+       if (key_distinct < 1.0)
                 key_distinct = 1.0;
  
         /*
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list

index 3451538565e8efbff36ef96a271768d0365e51b9..04845d5e6809eb61f6276f42af06e909624f5f13 100644 (file)
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -366,6 +366,7 @@ BulkWriteBuffer
  BulkWriteState
  BumpBlock
  BumpContext
+ByteaSortSupport
  CACHESIGN
  CAC_state
  CCFastEqualFN
author	John Naylor <john.naylor@postgresql.org>
	Tue, 16 Dec 2025 08:19:16 +0000 (15:19 +0700)
committer	John Naylor <john.naylor@postgresql.org>
	Tue, 16 Dec 2025 08:19:16 +0000 (15:19 +0700)
src/backend/utils/adt/bytea.c		patch \| blob \| blame \| history
src/backend/utils/adt/varlena.c		patch \| blob \| blame \| history
src/tools/pgindent/typedefs.list		patch \| blob \| blame \| history