Add base32hex support to encode() and decode() functions.

author Masahiko Sawada <msawada@postgresql.org>

Wed, 25 Mar 2026 18:35:19 +0000 (11:35 -0700)

committer Masahiko Sawada <msawada@postgresql.org>

Wed, 25 Mar 2026 18:35:19 +0000 (11:35 -0700)
author Masahiko Sawada <msawada@postgresql.org>
Wed, 25 Mar 2026 18:35:19 +0000 (11:35 -0700)
committer Masahiko Sawada <msawada@postgresql.org>
Wed, 25 Mar 2026 18:35:19 +0000 (11:35 -0700)
diff --git a/doc/src/sgml/func/func-binarystring.sgml b/doc/src/sgml/func/func-binarystring.sgml

index b256381e01f06e60d012130058564b536b7d58d1..0aaf9bc68f1968dee5efe35477ccc3bf3399097b 100644 (file)
--- a/doc/src/sgml/func/func-binarystring.sgml
+++ b/doc/src/sgml/func/func-binarystring.sgml
@@ -727,6 +727,7 @@
        <para>
         Encodes binary data into a textual representation; supported
         <parameter>format</parameter> values are:
+       <link linkend="encode-format-base32hex"><literal>base32hex</literal></link>,
         <link linkend="encode-format-base64"><literal>base64</literal></link>,
         <link linkend="encode-format-base64url"><literal>base64url</literal></link>,
         <link linkend="encode-format-escape"><literal>escape</literal></link>,
@@ -766,6 +767,32 @@
     functions support the following textual formats:
  
     <variablelist>
+    <varlistentry id="encode-format-base32hex">
+     <term>base32hex
+      <indexterm>
+       <primary>base32hex format</primary>
+      </indexterm></term>
+     <listitem>
+      <para>
+       The <literal>base32hex</literal> format is that of
+       <ulink url="https://datatracker.ietf.org/doc/html/rfc4648#section-7">
+       RFC 4648 Section 7</ulink>.  It uses the extended hex alphabet
+       (<literal>0</literal>-<literal>9</literal> and
+       <literal>A</literal>-<literal>V</literal>) which preserves the lexicographical
+       sort order of the encoded data. The <function>encode</function> function
+       produces output padded with <literal>'='</literal>, while <function>decode</function>
+       accepts both padded and unpadded input. Decoding is case-insensitive and ignores
+       whitespace characters.
+      </para>
+      <para>
+       This format is useful for encoding UUIDs in a compact, sortable form:
+       <literal>rtrim(encode(uuid_value::bytea, 'base32hex'), '=')</literal>
+       produces a 26-character string compared to the standard 36-character
+       UUID representation.
+      </para>
+     </listitem>
+    </varlistentry>
+
      <varlistentry id="encode-format-base64">
       <term>base64
       <indexterm>
diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c

index f5f835e944ab422b01f3520caf735c0c57428d39..9ea3ddb49ec0346f0ca140307ef68bf86212b80b 100644 (file)
--- a/src/backend/utils/adt/encode.c
+++ b/src/backend/utils/adt/encode.c
@@ -65,8 +65,8 @@ binary_encode(PG_FUNCTION_ARGS)
                 ereport(ERROR,
                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                  errmsg("unrecognized encoding: \"%s\"", namebuf),
-                                errhint("Valid encodings are \"%s\", \"%s\", \"%s\", and \"%s\".",
-                                                "base64", "base64url", "escape", "hex")));
+                                errhint("Valid encodings are \"%s\", \"%s\", \"%s\", \"%s\", and \"%s\".",
+                                                "base32hex", "base64", "base64url", "escape", "hex")));
  
         dataptr = VARDATA_ANY(data);
         datalen = VARSIZE_ANY_EXHDR(data);
@@ -115,8 +115,8 @@ binary_decode(PG_FUNCTION_ARGS)
                 ereport(ERROR,
                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                  errmsg("unrecognized encoding: \"%s\"", namebuf),
-                                errhint("Valid encodings are \"%s\", \"%s\", \"%s\", and \"%s\".",
-                                                "base64", "base64url", "escape", "hex")));
+                                errhint("Valid encodings are \"%s\", \"%s\", \"%s\", \"%s\", and \"%s\".",
+                                                "base32hex", "base64", "base64url", "escape", "hex")));
  
         dataptr = VARDATA_ANY(data);
         datalen = VARSIZE_ANY_EXHDR(data);
@@ -825,6 +825,153 @@ esc_dec_len(const char *src, size_t srclen)
         return len;
  }
  
+/*
+ * BASE32HEX: RFC 4648 "extended hex" alphabet (0-9, A-V), 5 bits per character
+ */
+
+static const char base32hex_table[] = "0123456789ABCDEFGHIJKLMNOPQRSTUV";  /* 5-bit value -> digit */
+
+static const int8 b32hexlookup[128] = {  /* ASCII code -> 5-bit value, or -1 if not a digit */
+       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  /* 0x00-0x0F */
+       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  /* 0x10-0x1F */
+       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  /* 0x20-0x2F */
+       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,  /* '0'-'9' */
+       -1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,  /* 'A'-'O' */
+       25, 26, 27, 28, 29, 30, 31, -1, -1, -1, -1, -1, -1, -1, -1, -1,  /* 'P'-'V' */
+       -1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,  /* 'a'-'o' (same values as upper case) */
+       25, 26, 27, 28, 29, 30, 31, -1, -1, -1, -1, -1, -1, -1, -1, -1,  /* 'p'-'v' */
+};
+
+static uint64
+base32hex_enc_len(const char *src, size_t srclen)  /* encoded length for srclen input bytes; src is not read */
+{
+       /* Each 5-byte group, including a partial final one, yields 8 characters once padded with '=' */
+       return ((uint64) srclen + 4) / 5 * 8;  /* ceil(srclen / 5) * 8, computed in 64 bits */
+}
+
+static uint64
+base32hex_dec_len(const char *src, size_t srclen)  /* upper bound on decoded length; src is not read */
+{
+       /* Each input character carries at most 5 bits ('=' and whitespace carry none), so this is an upper bound */
+       return ((uint64) srclen * 5) / 8;
+}
+
+static uint64
+base32hex_encode(const char *src, size_t srclen, char *dst)  /* returns the number of characters written to dst */
+{
+       const unsigned char *data = (const unsigned char *) src;  /* read the input as unsigned bytes */
+       uint32          bits_buffer = 0;  /* bits read but not yet emitted, right-aligned */
+       int                     bits_in_buffer = 0;  /* count of valid bits in bits_buffer */
+       uint64          output_pos = 0;  /* next write index in dst; no terminator is written */
+       size_t          i;
+
+       for (i = 0; i < srclen; i++)
+       {
+               /* Append the next input byte below the (at most 4) bits still pending */
+               bits_buffer = (bits_buffer << 8) | data[i];
+               bits_in_buffer += 8;
+
+               /* Emit one output character per complete 5-bit chunk */
+               while (bits_in_buffer >= 5)
+               {
+                       bits_in_buffer -= 5;
+                       /* The top 5 pending bits select the output digit */
+                       dst[output_pos++] = base32hex_table[(bits_buffer >> bits_in_buffer) & 0x1F];
+                       /* Keep only the bits that have not been emitted yet */
+                       bits_buffer &= ((1U << bits_in_buffer) - 1);
+               }
+       }
+
+       /* Emit the 1-4 leftover bits, left-aligned in a final 5-bit chunk with zero fill */
+       if (bits_in_buffer > 0)
+               dst[output_pos++] = base32hex_table[(bits_buffer << (5 - bits_in_buffer)) & 0x1F];
+
+       /* Add padding to make length a multiple of 8 (per RFC 4648) */
+       while (output_pos % 8 != 0)
+               dst[output_pos++] = '=';
+
+       return output_pos;
+}
+
+static uint64
+base32hex_decode(const char *src, size_t srclen, char *dst)  /* returns the number of bytes written to dst */
+{
+       const char *srcend = src + srclen,
+                          *s = src;
+       uint32          bits_buffer = 0;  /* decoded bits not yet emitted, right-aligned */
+       int                     bits_in_buffer = 0;  /* count of valid bits in bits_buffer */
+       uint64          output_pos = 0;  /* next write index in dst */
+       int                     pos = 0;                /* position within 8-character group; not reset once padding starts */
+       bool            end = false;    /* set once the first '=' has been seen */
+
+       while (s < srcend)
+       {
+               char            c = *s++;
+               int                     val;
+
+               /* Skip whitespace (space, tab, LF, CR) wherever it appears */
+               if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
+                       continue;
+
+               if (c == '=')
+               {
+                       /*
+                        * The first padding is only valid at positions 2, 4, 5, or 7
+                        * within an 8-character group (a final group carrying 1, 2,
+                        * 3, or 4 data bytes). Only the first '=' is position-checked;
+                        * the number of '=' characters that follow is not verified.
+                        */
+                       if (!end)
+                       {
+                               if (pos != 2 && pos != 4 && pos != 5 && pos != 7)
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                                        errmsg("unexpected \"=\" while decoding base32hex sequence")));
+                               end = true;
+                       }
+                       pos++;
+                       continue;
+               }
+
+               /* No data characters allowed after padding (whitespace was already skipped above) */
+               if (end)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                        errmsg("invalid symbol \"%.*s\" found while decoding base32hex sequence",
+                                                       pg_mblen_range(s - 1, srcend), s - 1)));
+
+               /* Map the character to its 5-bit value (0-9, A-V, either case); bytes >= 128 are never valid */
+               val = -1;
+               if ((unsigned char) c < 128)
+                       val = b32hexlookup[(unsigned char) c];
+               if (val < 0)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                        errmsg("invalid symbol \"%.*s\" found while decoding base32hex sequence",
+                                                       pg_mblen_range(s - 1, srcend), s - 1)));
+
+               /* Append the 5 decoded bits below the bits still pending */
+               bits_buffer = (bits_buffer << 5) | val;
+               bits_in_buffer += 5;
+               pos++;
+
+               /* Emit one output byte per complete 8-bit chunk */
+               while (bits_in_buffer >= 8)
+               {
+                       bits_in_buffer -= 8;
+                       dst[output_pos++] = (unsigned char) (bits_buffer >> bits_in_buffer);
+                       /* Keep only the bits that have not been emitted yet */
+                       bits_buffer &= ((1U << bits_in_buffer) - 1);
+               }
+
+               /* Reset position after each complete 8-character group */
+               if (pos == 8)
+                       pos = 0;
+       }
+
+       return output_pos;  /* any leftover bits (fewer than 8) are discarded without error */
+}
+
  /*
   * Common
   */
@@ -854,6 +1001,12 @@ static const struct
                         pg_base64url_enc_len, pg_base64url_dec_len, pg_base64url_encode, pg_base64url_decode
                 }
         },
+       {
+               "base32hex",
+               {
+                       base32hex_enc_len, base32hex_dec_len, base32hex_encode, base32hex_decode
+               }
+       },
         {
                 "escape",
                 {
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out

index f38688b5c37eb6efdcd24d9256e151a7cf1d4a18..a49b75fa1f9932efa3bc54a8533c34c8f3089f5f 100644 (file)
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -2600,14 +2600,170 @@ SELECT decode(encode('\x1234567890abcdef00', 'escape'), 'escape');
  -- report an error with a hint listing valid encodings when an invalid encoding is specified
  SELECT encode('\x01'::bytea, 'invalid');  -- error
  ERROR:  unrecognized encoding: "invalid"
-HINT:  Valid encodings are "base64", "base64url", "escape", and "hex".
+HINT:  Valid encodings are "base32hex", "base64", "base64url", "escape", and "hex".
  SELECT decode('00', 'invalid');           -- error
  ERROR:  unrecognized encoding: "invalid"
-HINT:  Valid encodings are "base64", "base64url", "escape", and "hex".
+HINT:  Valid encodings are "base32hex", "base64", "base64url", "escape", and "hex".
  --
--- base64url encoding/decoding
+-- base32hex encoding/decoding
  --
  SET bytea_output TO hex;
+SELECT encode('', 'base32hex');  -- ''
+ encode 
+--------
+ 
+(1 row)
+
+SELECT encode('\x11', 'base32hex');  -- '24======'
+  encode  
+----------
+ 24======
+(1 row)
+
+SELECT encode('\x1122', 'base32hex');  -- '24H0===='
+  encode  
+----------
+ 24H0====
+(1 row)
+
+SELECT encode('\x112233', 'base32hex');  -- '24H36==='
+  encode  
+----------
+ 24H36===
+(1 row)
+
+SELECT encode('\x11223344', 'base32hex');  -- '24H36H0='
+  encode  
+----------
+ 24H36H0=
+(1 row)
+
+SELECT encode('\x1122334455', 'base32hex');  -- '24H36H2L'
+  encode  
+----------
+ 24H36H2L
+(1 row)
+
+SELECT encode('\x112233445566', 'base32hex');  -- '24H36H2LCO======'
+      encode      
+------------------
+ 24H36H2LCO======
+(1 row)
+
+SELECT decode('', 'base32hex');  -- ''
+ decode 
+--------
+ \x
+(1 row)
+
+SELECT decode('24======', 'base32hex');  -- \x11
+ decode 
+--------
+ \x11
+(1 row)
+
+SELECT decode('24H0====', 'base32hex');  -- \x1122
+ decode 
+--------
+ \x1122
+(1 row)
+
+SELECT decode('24H36===', 'base32hex');  -- \x112233
+  decode  
+----------
+ \x112233
+(1 row)
+
+SELECT decode('24H36H0=', 'base32hex');  -- \x11223344
+   decode   
+------------
+ \x11223344
+(1 row)
+
+SELECT decode('24H36H2L', 'base32hex');  -- \x1122334455
+    decode    
+--------------
+ \x1122334455
+(1 row)
+
+SELECT decode('24H36H2LCO======', 'base32hex');  -- \x112233445566
+     decode     
+----------------
+ \x112233445566
+(1 row)
+
+SELECT decode('24h36h2lco', 'base32hex');  -- OK, the encoding is case-insensitive
+     decode     
+----------------
+ \x112233445566
+(1 row)
+
+-- Tests for decoding unpadded base32hex strings. Padding '=' are optional.
+SELECT decode('24', 'base32hex');
+ decode 
+--------
+ \x11
+(1 row)
+
+SELECT decode('24H', 'base32hex');
+ decode 
+--------
+ \x11
+(1 row)
+
+SELECT decode('24H36', 'base32hex');
+  decode  
+----------
+ \x112233
+(1 row)
+
+SELECT decode('24H36H0', 'base32hex');
+   decode   
+------------
+ \x11223344
+(1 row)
+
+SELECT decode('2', 'base32hex'); -- \x, 5 bits isn't enough for a byte, so nothing is emitted
+ decode 
+--------
+ \x
+(1 row)
+
+SELECT decode('11=', 'base32hex');  -- OK, non-zero padding bits are accepted (consistent with base64)
+ decode 
+--------
+ \x08
+(1 row)
+
+SELECT decode('2=', 'base32hex'); -- error
+ERROR:  unexpected "=" while decoding base32hex sequence
+SELECT decode('=', 'base32hex');  -- error
+ERROR:  unexpected "=" while decoding base32hex sequence
+SELECT decode('W', 'base32hex');  -- error
+ERROR:  invalid symbol "W" found while decoding base32hex sequence
+SELECT decode('24H36H0=24', 'base32hex'); -- error
+ERROR:  invalid symbol "2" found while decoding base32hex sequence
+-- Check round-trip capability of base32hex encoding for multiple random UUIDs.
+DO $$
+DECLARE
+  v1 uuid;
+  v2 uuid;
+BEGIN
+  FOR i IN 1..10 LOOP
+    v1 := gen_random_uuid();
+    v2 := decode(encode(v1::bytea, 'base32hex'), 'base32hex')::uuid;
+
+    IF v1 != v2 THEN
+      RAISE EXCEPTION 'base32hex encoding round-trip failed, expected % got %', v1, v2;
+    END IF;
+  END LOOP;
+  RAISE NOTICE 'OK';
+END;
+$$;
+NOTICE:  OK
+--
+-- base64url encoding/decoding
+--
  -- Simple encoding/decoding
  SELECT encode('\x69b73eff', 'base64url');  -- abc-_w
   encode 
diff --git a/src/test/regress/expected/uuid.out b/src/test/regress/expected/uuid.out

index d157ef7d0b33850ed334b9f0ef6e585c4b33a014..142c529e693828a7dbc736106b0332405f5a03de 100644 (file)
--- a/src/test/regress/expected/uuid.out
+++ b/src/test/regress/expected/uuid.out
@@ -13,7 +13,8 @@ CREATE TABLE guid2
  CREATE TABLE guid3
  (
         id SERIAL,
-       guid_field UUID
+       guid_field UUID,
+       guid_encoded text GENERATED ALWAYS AS (encode(guid_field::bytea, 'base32hex')) STORED
  );
  -- inserting invalid data tests
  -- too long
@@ -226,11 +227,20 @@ SELECT count(DISTINCT guid_field) FROM guid1;
  (1 row)
  
  -- test sortability of v7
+INSERT INTO guid3 (guid_field) VALUES ('00000000-0000-0000-0000-000000000000'::uuid);
  INSERT INTO guid3 (guid_field) SELECT uuidv7() FROM generate_series(1, 10);
+INSERT INTO guid3 (guid_field) VALUES ('ffffffff-ffff-ffff-ffff-ffffffffffff'::uuid);
  SELECT array_agg(id ORDER BY guid_field) FROM guid3;
-       array_agg        
-------------------------
- {1,2,3,4,5,6,7,8,9,10}
+          array_agg           
+------------------------------
+ {1,2,3,4,5,6,7,8,9,10,11,12}
+(1 row)
+
+-- make sure base32hex encoding works with UUIDs and preserves ordering
+SELECT array_agg(id ORDER BY guid_encoded) FROM guid3;
+          array_agg           
+------------------------------
+ {1,2,3,4,5,6,7,8,9,10,11,12}
  (1 row)
  
  -- Check the timestamp offsets for v7.
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql

index d8a09737668b967971b59cee26a53103fc958648..5ae0e7da31a38b15041242c97ed4bc66f3555340 100644 (file)
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -835,10 +835,65 @@ SELECT encode('\x01'::bytea, 'invalid');  -- error
  SELECT decode('00', 'invalid');           -- error
  
  --
--- base64url encoding/decoding
+-- base32hex encoding/decoding
  --
  SET bytea_output TO hex;
  
+SELECT encode('', 'base32hex');  -- ''
+SELECT encode('\x11', 'base32hex');  -- '24======'
+SELECT encode('\x1122', 'base32hex');  -- '24H0===='
+SELECT encode('\x112233', 'base32hex');  -- '24H36==='
+SELECT encode('\x11223344', 'base32hex');  -- '24H36H0='
+SELECT encode('\x1122334455', 'base32hex');  -- '24H36H2L'
+SELECT encode('\x112233445566', 'base32hex');  -- '24H36H2LCO======'
+
+SELECT decode('', 'base32hex');  -- ''
+SELECT decode('24======', 'base32hex');  -- \x11
+SELECT decode('24H0====', 'base32hex');  -- \x1122
+SELECT decode('24H36===', 'base32hex');  -- \x112233
+SELECT decode('24H36H0=', 'base32hex');  -- \x11223344
+SELECT decode('24H36H2L', 'base32hex');  -- \x1122334455
+SELECT decode('24H36H2LCO======', 'base32hex');  -- \x112233445566
+
+SELECT decode('24h36h2lco', 'base32hex');  -- OK, the encoding is case-insensitive
+
+-- Tests for decoding unpadded base32hex strings. Padding '=' are optional.
+SELECT decode('24', 'base32hex');
+SELECT decode('24H', 'base32hex');
+SELECT decode('24H36', 'base32hex');
+SELECT decode('24H36H0', 'base32hex');
+
+SELECT decode('2', 'base32hex'); -- \x, 5 bits isn't enough for a byte, so nothing is emitted
+SELECT decode('11=', 'base32hex');  -- OK, non-zero padding bits are accepted (consistent with base64)
+
+SELECT decode('2=', 'base32hex'); -- error
+SELECT decode('=', 'base32hex');  -- error
+SELECT decode('W', 'base32hex');  -- error
+SELECT decode('24H36H0=24', 'base32hex'); -- error
+
+-- Check round-trip capability of base32hex encoding for multiple random UUIDs.
+DO $$
+DECLARE
+  v1 uuid;
+  v2 uuid;
+BEGIN
+  FOR i IN 1..10 LOOP
+    v1 := gen_random_uuid();
+    v2 := decode(encode(v1::bytea, 'base32hex'), 'base32hex')::uuid;
+
+    IF v1 != v2 THEN
+      RAISE EXCEPTION 'base32hex encoding round-trip failed, expected % got %', v1, v2;
+    END IF;
+  END LOOP;
+  RAISE NOTICE 'OK';
+END;
+$$;
+
+
+--
+-- base64url encoding/decoding
+--
+
  -- Simple encoding/decoding
  SELECT encode('\x69b73eff', 'base64url');  -- abc-_w
  SELECT decode('abc-_w', 'base64url');      -- \x69b73eff
diff --git a/src/test/regress/sql/uuid.sql b/src/test/regress/sql/uuid.sql

index f512f4dea1d1794e8d347f4019721e5d6f02c4cf..f2ff00f5ddde78c8812ad8e228c7d233ecc997d2 100644 (file)
--- a/src/test/regress/sql/uuid.sql
+++ b/src/test/regress/sql/uuid.sql
@@ -13,7 +13,8 @@ CREATE TABLE guid2
  CREATE TABLE guid3
  (
         id SERIAL,
-       guid_field UUID
+       guid_field UUID,
+       guid_encoded text GENERATED ALWAYS AS (encode(guid_field::bytea, 'base32hex')) STORED
  );
  
  -- inserting invalid data tests
@@ -116,9 +117,14 @@ INSERT INTO guid1 (guid_field) VALUES (uuidv7(INTERVAL '1 day'));
  SELECT count(DISTINCT guid_field) FROM guid1;
  
  -- test sortability of v7
+INSERT INTO guid3 (guid_field) VALUES ('00000000-0000-0000-0000-000000000000'::uuid);
  INSERT INTO guid3 (guid_field) SELECT uuidv7() FROM generate_series(1, 10);
+INSERT INTO guid3 (guid_field) VALUES ('ffffffff-ffff-ffff-ffff-ffffffffffff'::uuid);
  SELECT array_agg(id ORDER BY guid_field) FROM guid3;
  
+-- make sure base32hex encoding works with UUIDs and preserves ordering
+SELECT array_agg(id ORDER BY guid_encoded) FROM guid3;
+
  -- Check the timestamp offsets for v7.
  --
  -- generate UUIDv7 values with timestamps ranging from 1970 (the Unix epoch year)
author	Masahiko Sawada <msawada@postgresql.org>
	Wed, 25 Mar 2026 18:35:19 +0000 (11:35 -0700)
committer	Masahiko Sawada <msawada@postgresql.org>
	Wed, 25 Mar 2026 18:35:19 +0000 (11:35 -0700)
doc/src/sgml/func/func-binarystring.sgml		patch \| blob \| blame \| history
src/backend/utils/adt/encode.c		patch \| blob \| blame \| history
src/test/regress/expected/strings.out		patch \| blob \| blame \| history
src/test/regress/expected/uuid.out		patch \| blob \| blame \| history
src/test/regress/sql/strings.sql		patch \| blob \| blame \| history
src/test/regress/sql/uuid.sql		patch \| blob \| blame \| history