From: Masahiko Sawada Date: Wed, 25 Mar 2026 18:35:19 +0000 (-0700) Subject: Add base32hex support to encode() and decode() functions. X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=497c1170cb1;p=thirdparty%2Fpostgresql.git Add base32hex support to encode() and decode() functions. This adds support for base32hex encoding and decoding, as defined in RFC 4648 Section 7. Unlike standard base32, base32hex uses the extended hex alphabet (0-9, A-V) which preserves the lexicographical order of the encoded data. This is particularly useful for representing UUIDv7 values in a compact string format while maintaining their time-ordered sort property. The encode() function produces output padded with '=', while decode() accepts both padded and unpadded input. Following the behavior of other encoding types, decoding is case-insensitive. Suggested-by: Sergey Prokhorenko Author: Andrey Borodin Co-authored-by: Aleksander Alekseev Reviewed-by: Masahiko Sawada Reviewed-by: Илья Чердаков Reviewed-by: Chengxi Sun Reviewed-by: Chao Li Discussion: https://postgr.es/m/CAJ7c6TOramr1UTLcyB128LWMqita1Y7%3Darq3KHaU%3Dqikf5yKOQ%40mail.gmail.com --- diff --git a/doc/src/sgml/func/func-binarystring.sgml b/doc/src/sgml/func/func-binarystring.sgml index b256381e01f..0aaf9bc68f1 100644 --- a/doc/src/sgml/func/func-binarystring.sgml +++ b/doc/src/sgml/func/func-binarystring.sgml @@ -727,6 +727,7 @@ Encodes binary data into a textual representation; supported format values are: + base32hex, base64, base64url, escape, @@ -766,6 +767,32 @@ functions support the following textual formats: + + base32hex + + base32hex format + + + + The base32hex format is that of + + RFC 4648 Section 7. It uses the extended hex alphabet + (0-9 and + A-V) which preserves the lexicographical + sort order of the encoded data. The encode function + produces output padded with '=', while decode + accepts both padded and unpadded input. Decoding is case-insensitive and ignores + whitespace characters. + + + This format is useful for encoding UUIDs in a compact, sortable format: + rtrim(encode(uuid_value::bytea, 'base32hex'), '=') + produces a 26-character string compared to the standard 36-character + UUID representation. + + + + base64 diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c index f5f835e944a..9ea3ddb49ec 100644 --- a/src/backend/utils/adt/encode.c +++ b/src/backend/utils/adt/encode.c @@ -65,8 +65,8 @@ binary_encode(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unrecognized encoding: \"%s\"", namebuf), - errhint("Valid encodings are \"%s\", \"%s\", \"%s\", and \"%s\".", - "base64", "base64url", "escape", "hex"))); + errhint("Valid encodings are \"%s\", \"%s\", \"%s\", \"%s\", and \"%s\".", + "base32hex", "base64", "base64url", "escape", "hex"))); dataptr = VARDATA_ANY(data); datalen = VARSIZE_ANY_EXHDR(data); @@ -115,8 +115,8 @@ binary_decode(PG_FUNCTION_ARGS) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unrecognized encoding: \"%s\"", namebuf), - errhint("Valid encodings are \"%s\", \"%s\", \"%s\", and \"%s\".", - "base64", "base64url", "escape", "hex"))); + errhint("Valid encodings are \"%s\", \"%s\", \"%s\", \"%s\", and \"%s\".", + "base32hex", "base64", "base64url", "escape", "hex"))); dataptr = VARDATA_ANY(data); datalen = VARSIZE_ANY_EXHDR(data); @@ -825,6 +825,153 @@ esc_dec_len(const char *src, size_t srclen) return len; } +/* + * BASE32HEX + */ + +static const char base32hex_table[] = "0123456789ABCDEFGHIJKLMNOPQRSTUV"; + +static const int8 b32hexlookup[128] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, -1, -1, -1, -1, -1, -1, -1, -1, -1, +}; + +static uint64 +base32hex_enc_len(const char *src, size_t srclen) +{ + /* 5 bytes encode to 8 characters, round up to multiple of 8 for padding */ + return ((uint64) srclen + 4) / 5 * 8; +} + +static uint64 +base32hex_dec_len(const char *src, size_t srclen) +{ + /* Each 8 characters of input produces at most 5 bytes of output */ + return ((uint64) srclen * 5) / 8; +} + +static uint64 +base32hex_encode(const char *src, size_t srclen, char *dst) +{ + const unsigned char *data = (const unsigned char *) src; + uint32 bits_buffer = 0; + int bits_in_buffer = 0; + uint64 output_pos = 0; + size_t i; + + for (i = 0; i < srclen; i++) + { + /* Add 8 bits to the buffer */ + bits_buffer = (bits_buffer << 8) | data[i]; + bits_in_buffer += 8; + + /* Extract 5-bit chunks while we have enough bits */ + while (bits_in_buffer >= 5) + { + bits_in_buffer -= 5; + /* Extract top 5 bits */ + dst[output_pos++] = base32hex_table[(bits_buffer >> bits_in_buffer) & 0x1F]; + /* Clear the extracted bits by masking */ + bits_buffer &= ((1U << bits_in_buffer) - 1); + } + } + + /* Handle remaining bits (if any) */ + if (bits_in_buffer > 0) + dst[output_pos++] = base32hex_table[(bits_buffer << (5 - bits_in_buffer)) & 0x1F]; + + /* Add padding to make length a multiple of 8 (per RFC 4648) */ + while (output_pos % 8 != 0) + dst[output_pos++] = '='; + + return output_pos; +} + +static uint64 +base32hex_decode(const char *src, size_t srclen, char *dst) +{ + const char *srcend = src + srclen, + *s = src; + uint32 bits_buffer = 0; + int bits_in_buffer = 0; + uint64 output_pos = 0; + int pos = 0; /* position within 8-character group (0-7) */ + bool end = false; /* have we seen padding? */ + + while (s < srcend) + { + char c = *s++; + int val; + + /* Skip whitespace */ + if (c == ' ' || c == '\t' || c == '\n' || c == '\r') + continue; + + if (c == '=') + { + /* + * The first padding is only valid at positions 2, 4, 5, or 7 + * within an 8-character group (corresponding to 1, 2, 3, or 4 + * input bytes). We only check the position for the first '=' + * character. + */ + if (!end) + { + if (pos != 2 && pos != 4 && pos != 5 && pos != 7) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unexpected \"=\" while decoding base32hex sequence"))); + end = true; + } + pos++; + continue; + } + + /* No data characters allowed after padding */ + if (end) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid symbol \"%.*s\" found while decoding base32hex sequence", + pg_mblen_range(s - 1, srcend), s - 1))); + + /* Decode base32hex character (0-9, A-V, case-insensitive) */ + val = -1; + if ((unsigned char) c < 128) + val = b32hexlookup[(unsigned char) c]; + if (val < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid symbol \"%.*s\" found while decoding base32hex sequence", + pg_mblen_range(s - 1, srcend), s - 1))); + + /* Add 5 bits to buffer */ + bits_buffer = (bits_buffer << 5) | val; + bits_in_buffer += 5; + pos++; + + /* Extract 8-bit bytes when we have enough bits */ + while (bits_in_buffer >= 8) + { + bits_in_buffer -= 8; + dst[output_pos++] = (unsigned char) (bits_buffer >> bits_in_buffer); + /* Clear the extracted bits */ + bits_buffer &= ((1U << bits_in_buffer) - 1); + } + + /* Reset position after each complete 8-character group */ + if (pos == 8) + pos = 0; + } + + return output_pos; +} + /* * Common */ @@ -854,6 +1001,12 @@ static const struct pg_base64url_enc_len, pg_base64url_dec_len, pg_base64url_encode, pg_base64url_decode } }, + { + "base32hex", + { + base32hex_enc_len, base32hex_dec_len, base32hex_encode, base32hex_decode + } + }, { "escape", { diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index f38688b5c37..a49b75fa1f9 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -2600,14 +2600,170 @@ SELECT decode(encode('\x1234567890abcdef00', 'escape'), 'escape'); -- report an error with a hint listing valid encodings when an invalid encoding is specified SELECT encode('\x01'::bytea, 'invalid'); -- error ERROR: unrecognized encoding: "invalid" -HINT: Valid encodings are "base64", "base64url", "escape", and "hex". +HINT: Valid encodings are "base32hex", "base64", "base64url", "escape", and "hex". SELECT decode('00', 'invalid'); -- error ERROR: unrecognized encoding: "invalid" -HINT: Valid encodings are "base64", "base64url", "escape", and "hex". +HINT: Valid encodings are "base32hex", "base64", "base64url", "escape", and "hex". -- --- base64url encoding/decoding +-- base32hex encoding/decoding -- SET bytea_output TO hex; +SELECT encode('', 'base32hex'); -- '' + encode +-------- + +(1 row) + +SELECT encode('\x11', 'base32hex'); -- '24======' + encode +---------- + 24====== +(1 row) + +SELECT encode('\x1122', 'base32hex'); -- '24H0====' + encode +---------- + 24H0==== +(1 row) + +SELECT encode('\x112233', 'base32hex'); -- '24H36===' + encode +---------- + 24H36=== +(1 row) + +SELECT encode('\x11223344', 'base32hex'); -- '24H36H0=' + encode +---------- + 24H36H0= +(1 row) + +SELECT encode('\x1122334455', 'base32hex'); -- '24H36H2L' + encode +---------- + 24H36H2L +(1 row) + +SELECT encode('\x112233445566', 'base32hex'); -- '24H36H2LCO======' + encode +------------------ + 24H36H2LCO====== +(1 row) + +SELECT decode('', 'base32hex'); -- '' + decode +-------- + \x +(1 row) + +SELECT decode('24======', 'base32hex'); -- \x11 + decode +-------- + \x11 +(1 row) + +SELECT decode('24H0====', 'base32hex'); -- \x1122 + decode +-------- + \x1122 +(1 row) + +SELECT decode('24H36===', 'base32hex'); -- \x112233 + decode +---------- + \x112233 +(1 row) + +SELECT decode('24H36H0=', 'base32hex'); -- \x11223344 + decode +------------ + \x11223344 +(1 row) + +SELECT decode('24H36H2L', 'base32hex'); -- \x1122334455 + decode +-------------- + \x1122334455 +(1 row) + +SELECT decode('24H36H2LCO======', 'base32hex'); -- \x112233445566 + decode +---------------- + \x112233445566 +(1 row) + +SELECT decode('24h36h2lco', 'base32hex'); -- OK, the encoding is case-insensitive + decode +---------------- + \x112233445566 +(1 row) + +-- Tests for decoding unpadded base32hex strings. Padding '=' are optional. +SELECT decode('24', 'base32hex'); + decode +-------- + \x11 +(1 row) + +SELECT decode('24H', 'base32hex'); + decode +-------- + \x11 +(1 row) + +SELECT decode('24H36', 'base32hex'); + decode +---------- + \x112233 +(1 row) + +SELECT decode('24H36H0', 'base32hex'); + decode +------------ + \x11223344 +(1 row) + +SELECT decode('2', 'base32hex'); -- \x, 5 bits isn't enough for a byte, so nothing is emitted + decode +-------- + \x +(1 row) + +SELECT decode('11=', 'base32hex'); -- OK, non-zero padding bits are accepted (consistent with base64) + decode +-------- + \x08 +(1 row) + +SELECT decode('2=', 'base32hex'); -- error +ERROR: unexpected "=" while decoding base32hex sequence +SELECT decode('=', 'base32hex'); -- error +ERROR: unexpected "=" while decoding base32hex sequence +SELECT decode('W', 'base32hex'); -- error +ERROR: invalid symbol "W" found while decoding base32hex sequence +SELECT decode('24H36H0=24', 'base32hex'); -- error +ERROR: invalid symbol "2" found while decoding base32hex sequence +-- Check round-trip capability of base32hex encoding for multiple random UUIDs. +DO $$ +DECLARE + v1 uuid; + v2 uuid; +BEGIN + FOR i IN 1..10 LOOP + v1 := gen_random_uuid(); + v2 := decode(encode(v1::bytea, 'base32hex'), 'base32hex')::uuid; + + IF v1 != v2 THEN + RAISE EXCEPTION 'base32hex encoding round-trip failed, expected % got %', v1, v2; + END IF; + END LOOP; + RAISE NOTICE 'OK'; +END; +$$; +NOTICE: OK +-- +-- base64url encoding/decoding +-- -- Simple encoding/decoding SELECT encode('\x69b73eff', 'base64url'); -- abc-_w encode diff --git a/src/test/regress/expected/uuid.out b/src/test/regress/expected/uuid.out index d157ef7d0b3..142c529e693 100644 --- a/src/test/regress/expected/uuid.out +++ b/src/test/regress/expected/uuid.out @@ -13,7 +13,8 @@ CREATE TABLE guid2 CREATE TABLE guid3 ( id SERIAL, - guid_field UUID + guid_field UUID, + guid_encoded text GENERATED ALWAYS AS (encode(guid_field::bytea, 'base32hex')) STORED ); -- inserting invalid data tests -- too long @@ -226,11 +227,20 @@ SELECT count(DISTINCT guid_field) FROM guid1; (1 row) -- test sortability of v7 +INSERT INTO guid3 (guid_field) VALUES ('00000000-0000-0000-0000-000000000000'::uuid); INSERT INTO guid3 (guid_field) SELECT uuidv7() FROM generate_series(1, 10); +INSERT INTO guid3 (guid_field) VALUES ('ffffffff-ffff-ffff-ffff-ffffffffffff'::uuid); SELECT array_agg(id ORDER BY guid_field) FROM guid3; - array_agg ------------------------- - {1,2,3,4,5,6,7,8,9,10} + array_agg +------------------------------ + {1,2,3,4,5,6,7,8,9,10,11,12} +(1 row) + +-- make sure base32hex encoding works with UUIDs and preserves ordering +SELECT array_agg(id ORDER BY guid_encoded) FROM guid3; + array_agg +------------------------------ + {1,2,3,4,5,6,7,8,9,10,11,12} (1 row) -- Check the timestamp offsets for v7. diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index d8a09737668..5ae0e7da31a 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -835,10 +835,65 @@ SELECT encode('\x01'::bytea, 'invalid'); -- error SELECT decode('00', 'invalid'); -- error -- --- base64url encoding/decoding +-- base32hex encoding/decoding -- SET bytea_output TO hex; +SELECT encode('', 'base32hex'); -- '' +SELECT encode('\x11', 'base32hex'); -- '24======' +SELECT encode('\x1122', 'base32hex'); -- '24H0====' +SELECT encode('\x112233', 'base32hex'); -- '24H36===' +SELECT encode('\x11223344', 'base32hex'); -- '24H36H0=' +SELECT encode('\x1122334455', 'base32hex'); -- '24H36H2L' +SELECT encode('\x112233445566', 'base32hex'); -- '24H36H2LCO======' + +SELECT decode('', 'base32hex'); -- '' +SELECT decode('24======', 'base32hex'); -- \x11 +SELECT decode('24H0====', 'base32hex'); -- \x1122 +SELECT decode('24H36===', 'base32hex'); -- \x112233 +SELECT decode('24H36H0=', 'base32hex'); -- \x11223344 +SELECT decode('24H36H2L', 'base32hex'); -- \x1122334455 +SELECT decode('24H36H2LCO======', 'base32hex'); -- \x112233445566 + +SELECT decode('24h36h2lco', 'base32hex'); -- OK, the encoding is case-insensitive + +-- Tests for decoding unpadded base32hex strings. Padding '=' are optional. +SELECT decode('24', 'base32hex'); +SELECT decode('24H', 'base32hex'); +SELECT decode('24H36', 'base32hex'); +SELECT decode('24H36H0', 'base32hex'); + +SELECT decode('2', 'base32hex'); -- \x, 5 bits isn't enough for a byte, so nothing is emitted +SELECT decode('11=', 'base32hex'); -- OK, non-zero padding bits are accepted (consistent with base64) + +SELECT decode('2=', 'base32hex'); -- error +SELECT decode('=', 'base32hex'); -- error +SELECT decode('W', 'base32hex'); -- error +SELECT decode('24H36H0=24', 'base32hex'); -- error + +-- Check round-trip capability of base32hex encoding for multiple random UUIDs. +DO $$ +DECLARE + v1 uuid; + v2 uuid; +BEGIN + FOR i IN 1..10 LOOP + v1 := gen_random_uuid(); + v2 := decode(encode(v1::bytea, 'base32hex'), 'base32hex')::uuid; + + IF v1 != v2 THEN + RAISE EXCEPTION 'base32hex encoding round-trip failed, expected % got %', v1, v2; + END IF; + END LOOP; + RAISE NOTICE 'OK'; +END; +$$; + + +-- +-- base64url encoding/decoding +-- + -- Simple encoding/decoding SELECT encode('\x69b73eff', 'base64url'); -- abc-_w SELECT decode('abc-_w', 'base64url'); -- \x69b73eff diff --git a/src/test/regress/sql/uuid.sql b/src/test/regress/sql/uuid.sql index f512f4dea1d..f2ff00f5ddd 100644 --- a/src/test/regress/sql/uuid.sql +++ b/src/test/regress/sql/uuid.sql @@ -13,7 +13,8 @@ CREATE TABLE guid2 CREATE TABLE guid3 ( id SERIAL, - guid_field UUID + guid_field UUID, + guid_encoded text GENERATED ALWAYS AS (encode(guid_field::bytea, 'base32hex')) STORED ); -- inserting invalid data tests @@ -116,9 +117,14 @@ INSERT INTO guid1 (guid_field) VALUES (uuidv7(INTERVAL '1 day')); SELECT count(DISTINCT guid_field) FROM guid1; -- test sortability of v7 +INSERT INTO guid3 (guid_field) VALUES ('00000000-0000-0000-0000-000000000000'::uuid); INSERT INTO guid3 (guid_field) SELECT uuidv7() FROM generate_series(1, 10); +INSERT INTO guid3 (guid_field) VALUES ('ffffffff-ffff-ffff-ffff-ffffffffffff'::uuid); SELECT array_agg(id ORDER BY guid_field) FROM guid3; +-- make sure base32hex encoding works with UUIDs and preserves ordering +SELECT array_agg(id ORDER BY guid_encoded) FROM guid3; + -- Check the timestamp offsets for v7. -- -- generate UUIDv7 values with timestamps ranging from 1970 (the Unix epoch year)