gh-146192: Add base32 support to binascii (GH-146193)

author kangtastic <942136+kangtastic@users.noreply.github.com>

Sun, 22 Mar 2026 21:10:28 +0000 (14:10 -0700)

committer GitHub <noreply@github.com>

Sun, 22 Mar 2026 21:10:28 +0000 (23:10 +0200)
author kangtastic <942136+kangtastic@users.noreply.github.com>
Sun, 22 Mar 2026 21:10:28 +0000 (14:10 -0700)
committer GitHub <noreply@github.com>
Sun, 22 Mar 2026 21:10:28 +0000 (23:10 +0200)
diff --git a/Doc/library/binascii.rst b/Doc/library/binascii.rst

index 70ba036756ff32637c6a13c4181baf81e006fc76..64c1ce948d2d322fbb4700f55d2ec26adaad250b 100644 (file)
--- a/Doc/library/binascii.rst
+++ b/Doc/library/binascii.rst
@@ -183,6 +183,38 @@ The :mod:`!binascii` module defines the following functions:
     .. versionadded:: 3.15
  
  
+.. function:: a2b_base32(string, /, *, alphabet=BASE32_ALPHABET)
+
+   Convert base32 data back to binary and return the binary data.
+
+   Valid base32 data contains characters from the base32 alphabet specified
+   in :rfc:`4648` in groups of eight (if necessary, the final group is padded
+   to eight characters with ``=``). Each group encodes 40 bits of binary data
+   in the range from ``0`` to ``2 ** 40 - 1``, inclusive.
+
+   .. note::
+      This function does not map lowercase characters (which are invalid in
+      standard base32) to their uppercase counterparts, nor does it
+      contextually map ``0`` to ``O`` and ``1`` to ``I``/``L`` as :rfc:`4648`
+      allows.
+
+   Optional *alphabet* must be a :class:`bytes` object of length 32 which
+   specifies an alternative alphabet.
+
+   Invalid base32 data will raise :exc:`binascii.Error`.
+
+   .. versionadded:: next
+
+.. function:: b2a_base32(data, /, *, alphabet=BASE32_ALPHABET)
+
+   Convert binary data to a line of ASCII characters in base32 coding,
+   as specified in :rfc:`4648`. The return value is the converted line.
+
+   Optional *alphabet* must be a :term:`bytes-like object` of length 32 which
+   specifies an alternative alphabet.
+
+   .. versionadded:: next
+
  .. function:: a2b_qp(data, header=False)
  
     Convert a block of quoted-printable data back to binary and return the binary
@@ -327,6 +359,20 @@ The :mod:`!binascii` module defines the following functions:
  
     .. versionadded:: next
  
+.. data:: BASE32_ALPHABET
+
+   The Base 32 alphabet according to :rfc:`4648`.
+
+   .. versionadded:: next
+
+.. data:: BASE32HEX_ALPHABET
+
+   The "Extended Hex" Base 32 alphabet according to :rfc:`4648`.
+   Data encoded with this alphabet maintains its sort order during bitwise
+   comparisons.
+
+   .. versionadded:: next
+
  
  .. seealso::
  
diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst

index 5e6265a45231db92e96890d4624a8e1140cca1b6..b40c75060a43365d964ea3b9164a53992c6886f4 100644 (file)
--- a/Doc/whatsnew/3.15.rst
+++ b/Doc/whatsnew/3.15.rst
@@ -662,6 +662,12 @@ binascii
  * Added the *ignorechars* parameter in :func:`~binascii.a2b_base64`.
    (Contributed by Serhiy Storchaka in :gh:`144001`.)
  
+* Added functions for Base32 encoding:
+
+  - :func:`~binascii.b2a_base32` and :func:`~binascii.a2b_base32`
+
+  (Contributed by James Seo in :gh:`146192`.)
+
  
  calendar
  --------
@@ -1279,6 +1285,10 @@ base64 & binascii
    two orders of magnitude less memory.
    (Contributed by James Seo and Serhiy Storchaka in :gh:`101178`.)
  
+* Implementation for Base32 has been rewritten in C.
+  Encoding and decoding are now two orders of magnitude faster.
+  (Contributed by James Seo in :gh:`146192`.)
+
  
  csv
  ---
diff --git a/Lib/base64.py b/Lib/base64.py

index a429760da79f2a1d7bdc19f2c6bdb5a318f3afa4..9b57cdfefce1e6e4a250d2a137985c86668d5a41 100644 (file)
--- a/Lib/base64.py
+++ b/Lib/base64.py
@@ -206,54 +206,13 @@ mapped to (when map01 is not None, the digit 0 is always mapped to
  the letter O).  For security purposes the default is None, so that
  0 and 1 are not allowed in the input.
  '''
-_b32alphabet = b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567'
-_b32hexalphabet = b'0123456789ABCDEFGHIJKLMNOPQRSTUV'
-_b32tab2 = {}
-_b32rev = {}
-
-def _b32encode(alphabet, s):
-    # Delay the initialization of the table to not waste memory
-    # if the function is never called
-    if alphabet not in _b32tab2:
-        b32tab = [bytes((i,)) for i in alphabet]
-        _b32tab2[alphabet] = [a + b for a in b32tab for b in b32tab]
-        b32tab = None
-
-    if not isinstance(s, bytes_types):
-        s = memoryview(s).tobytes()
-    leftover = len(s) % 5
-    # Pad the last quantum with zero bits if necessary
-    if leftover:
-        s = s + b'\0' * (5 - leftover)  # Don't use += !
-    encoded = bytearray()
-    from_bytes = int.from_bytes
-    b32tab2 = _b32tab2[alphabet]
-    for i in range(0, len(s), 5):
-        c = from_bytes(s[i: i + 5])              # big endian
-        encoded += (b32tab2[c >> 30] +           # bits 1 - 10
-                    b32tab2[(c >> 20) & 0x3ff] + # bits 11 - 20
-                    b32tab2[(c >> 10) & 0x3ff] + # bits 21 - 30
-                    b32tab2[c & 0x3ff]           # bits 31 - 40
-                   )
-    # Adjust for any leftover partial quanta
-    if leftover == 1:
-        encoded[-6:] = b'======'
-    elif leftover == 2:
-        encoded[-4:] = b'===='
-    elif leftover == 3:
-        encoded[-3:] = b'==='
-    elif leftover == 4:
-        encoded[-1:] = b'='
-    return encoded.take_bytes()
-
-def _b32decode(alphabet, s, casefold=False, map01=None):
-    # Delay the initialization of the table to not waste memory
-    # if the function is never called
-    if alphabet not in _b32rev:
-        _b32rev[alphabet] = {v: k for k, v in enumerate(alphabet)}
+
+def b32encode(s):
+    return binascii.b2a_base32(s)  # default (standard base32) alphabet
+b32encode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32')
+
+def b32decode(s, casefold=False, map01=None):
      s = _bytes_from_decode_data(s)
-    if len(s) % 8:
-        raise binascii.Error('Incorrect padding')
      # Handle section 2.4 zero and one mapping.  The flag map01 will be either
      # False, or the character to map the digit 1 (one) to.  It should be
      # either L (el) or I (eye).
@@ -263,51 +222,20 @@ def _b32decode(alphabet, s, casefold=False, map01=None):
          s = s.translate(bytes.maketrans(b'01', b'O' + map01))
      if casefold:
          s = s.upper()
-    # Strip off pad characters from the right.  We need to count the pad
-    # characters because this will tell us how many null bytes to remove from
-    # the end of the decoded string.
-    l = len(s)
-    s = s.rstrip(b'=')
-    padchars = l - len(s)
-    # Now decode the full quanta
-    decoded = bytearray()
-    b32rev = _b32rev[alphabet]
-    for i in range(0, len(s), 8):
-        quanta = s[i: i + 8]
-        acc = 0
-        try:
-            for c in quanta:
-                acc = (acc << 5) + b32rev[c]
-        except KeyError:
-            raise binascii.Error('Non-base32 digit found') from None
-        decoded += acc.to_bytes(5)  # big endian
-    # Process the last, partial quanta
-    if l % 8 or padchars not in {0, 1, 3, 4, 6}:
-        raise binascii.Error('Incorrect padding')
-    if padchars and decoded:
-        acc <<= 5 * padchars
-        last = acc.to_bytes(5)  # big endian
-        leftover = (43 - 5 * padchars) // 8  # 1: 4, 3: 3, 4: 2, 6: 1
-        decoded[-5:] = last[:leftover]
-    return decoded.take_bytes()
-
-
-def b32encode(s):
-    return _b32encode(_b32alphabet, s)
-b32encode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32')
-
-def b32decode(s, casefold=False, map01=None):
-    return _b32decode(_b32alphabet, s, casefold, map01)
+    return binascii.a2b_base32(s)
  b32decode.__doc__ = _B32_DECODE_DOCSTRING.format(encoding='base32',
                                          extra_args=_B32_DECODE_MAP01_DOCSTRING)
  
  def b32hexencode(s):
-    return _b32encode(_b32hexalphabet, s)
+    return binascii.b2a_base32(s, alphabet=binascii.BASE32HEX_ALPHABET)
  b32hexencode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32hex')
  
  def b32hexdecode(s, casefold=False):
+    s = _bytes_from_decode_data(s)
      # base32hex does not have the 01 mapping
-    return _b32decode(_b32hexalphabet, s, casefold)
+    if casefold:
+        s = s.upper()
+    return binascii.a2b_base32(s, alphabet=binascii.BASE32HEX_ALPHABET)
  b32hexdecode.__doc__ = _B32_DECODE_DOCSTRING.format(encoding='base32hex',
                                                      extra_args='')
  
diff --git a/Lib/test/test_binascii.py b/Lib/test/test_binascii.py

index 667ec9b5241aa9b044acdd8968c1b27e630453d1..d4879667c71461dbc4dd582c2d54295a03b09e3c 100644 (file)
--- a/Lib/test/test_binascii.py
+++ b/Lib/test/test_binascii.py
@@ -10,10 +10,10 @@ from test.support.hypothesis_helper import hypothesis
  
  
  # Note: "*_hex" functions are aliases for "(un)hexlify"
-b2a_functions = ['b2a_ascii85', 'b2a_base64', 'b2a_base85',
+b2a_functions = ['b2a_ascii85', 'b2a_base32', 'b2a_base64', 'b2a_base85',
                   'b2a_hex', 'b2a_qp', 'b2a_uu',
                   'hexlify']
-a2b_functions = ['a2b_ascii85', 'a2b_base64', 'a2b_base85',
+a2b_functions = ['a2b_ascii85', 'a2b_base32', 'a2b_base64', 'a2b_base85',
                   'a2b_hex', 'a2b_qp', 'a2b_uu',
                   'unhexlify']
  all_functions = a2b_functions + b2a_functions + ['crc32', 'crc_hqx']
@@ -75,6 +75,11 @@ class BinASCIITest(unittest.TestCase):
                           b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                           b'.-:+=^!/*?&<>()[]{}@%$#')
  
+        self.assertEqual(binascii.BASE32_ALPHABET,
+                         b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567')
+        self.assertEqual(binascii.BASE32HEX_ALPHABET,
+                         b'0123456789ABCDEFGHIJKLMNOPQRSTUV')
+
      def test_functions(self):
          # Check presence of all functions
          for name in all_functions:
@@ -670,6 +675,183 @@ class BinASCIITest(unittest.TestCase):
          with self.assertRaises(TypeError):
              binascii.a2b_base64(data, alphabet=bytearray(alphabet))
  
+    def test_base32_valid(self):
+        # Test base32 with valid data: encode chunks, decode them, compare
+        lines = []
+        step = 0  # first chunk is empty, then sizes grow 1, 2, 3, ...
+        i = 0
+        while i < len(self.rawdata):
+            b = self.type2test(self.rawdata[i:i + step])
+            a = binascii.b2a_base32(b)
+            lines.append(a)
+            i += step
+            step += 1
+        res = bytes()
+        for line in lines:
+            a = self.type2test(line)
+            b = binascii.a2b_base32(a)
+            res += b
+        self.assertEqual(res, self.rawdata)  # concatenation restores the input
+
+    def test_base32_errors(self):
+        def _fixPadding(data):
+            fixed = data.replace(b"=", b"")
+            len_8 = len(fixed) % 8
+            p = 8 - len_8 if len_8 else 0
+            return fixed + b"=" * p
+
+        def _assertRegexTemplate(assert_regex, data, good_padding_result=None):
+            with self.assertRaisesRegex(binascii.Error, assert_regex):
+                binascii.a2b_base32(self.type2test(data))
+            if good_padding_result is not None:  # b"" is a valid expectation
+                fixed = self.type2test(_fixPadding(data))
+                self.assertEqual(binascii.a2b_base32(fixed), good_padding_result)
+
+        def assertNonBase32Data(*args):
+            _assertRegexTemplate(r"(?i)Only base32 data", *args)
+
+        def assertExcessData(*args):
+            _assertRegexTemplate(r"(?i)Excess data", *args)
+
+        def assertExcessPadding(*args):
+            _assertRegexTemplate(r"(?i)Excess padding", *args)
+
+        def assertLeadingPadding(*args):
+            _assertRegexTemplate(r"(?i)Leading padding", *args)
+
+        def assertIncorrectPadding(*args):
+            _assertRegexTemplate(r"(?i)Incorrect padding", *args)
+
+        def assertDiscontinuousPadding(*args):
+            _assertRegexTemplate(r"(?i)Discontinuous padding", *args)
+
+        def assertInvalidLength(*args):
+            _assertRegexTemplate(r"(?i)Invalid.+number of data characters", *args)
+
+        assertNonBase32Data(b"a")
+        assertNonBase32Data(b"AA-")
+        assertNonBase32Data(b"ABCDE==!")
+        assertNonBase32Data(b"ab:(){:|:&};:==")
+
+        assertExcessData(b"AB======C")
+        assertExcessData(b"AB======CD")
+        assertExcessData(b"ABCD====E")
+        assertExcessData(b"ABCDE===FGH")
+        assertExcessData(b"ABCDEFG=H")
+        assertExcessData(b"432Z====55555555")
+
+        assertExcessData(b"BE======EF", b"\t\x08")
+        assertExcessData(b"BEEF====C", b"\t\x08Q")
+        assertExcessData(b"BEEFC===AK", b"\t\x08Q\x01")
+        assertExcessData(b"BEEFCAK=E", b"\t\x08Q\x01D")
+
+        assertExcessPadding(b"BE=======", b"\t")
+        assertExcessPadding(b"BE========", b"\t")
+        assertExcessPadding(b"BEEF=====", b"\t\x08")
+        assertExcessPadding(b"BEEF======", b"\t\x08")
+        assertExcessPadding(b"BEEFC====", b"\t\x08Q")
+        assertExcessPadding(b"BEEFC=====", b"\t\x08Q")
+        assertExcessPadding(b"BEEFCAK==", b"\t\x08Q\x01")
+        assertExcessPadding(b"BEEFCAK===", b"\t\x08Q\x01")
+        assertExcessPadding(b"BEEFCAKE=", b"\t\x08Q\x01D")
+        assertExcessPadding(b"BEEFCAKE==", b"\t\x08Q\x01D")
+        assertExcessPadding(b"BEEFCAKE===", b"\t\x08Q\x01D")
+        assertExcessPadding(b"BEEFCAKE====", b"\t\x08Q\x01D")
+        assertExcessPadding(b"BEEFCAKE=====", b"\t\x08Q\x01D")
+        assertExcessPadding(b"BEEFCAKE======", b"\t\x08Q\x01D")
+        assertExcessPadding(b"BEEFCAKE=======", b"\t\x08Q\x01D")
+        assertExcessPadding(b"BEEFCAKE========", b"\t\x08Q\x01D")
+        assertExcessPadding(b"BEEFCAKE=========", b"\t\x08Q\x01D")
+
+        assertLeadingPadding(b"=", b"")
+        assertLeadingPadding(b"==", b"")
+        assertLeadingPadding(b"===", b"")
+        assertLeadingPadding(b"====", b"")
+        assertLeadingPadding(b"=====", b"")
+        assertLeadingPadding(b"======", b"")
+        assertLeadingPadding(b"=======", b"")
+        assertLeadingPadding(b"========", b"")
+        assertLeadingPadding(b"=========", b"")
+        assertLeadingPadding(b"=BEEFCAKE", b"\t\x08Q\x01D")
+        assertLeadingPadding(b"==BEEFCAKE", b"\t\x08Q\x01D")
+        assertLeadingPadding(b"===BEEFCAKE", b"\t\x08Q\x01D")
+        assertLeadingPadding(b"====BEEFCAKE", b"\t\x08Q\x01D")
+        assertLeadingPadding(b"=====BEEFCAKE", b"\t\x08Q\x01D")
+        assertLeadingPadding(b"======BEEFCAKE", b"\t\x08Q\x01D")
+        assertLeadingPadding(b"=======BEEFCAKE", b"\t\x08Q\x01D")
+        assertLeadingPadding(b"========BEEFCAKE", b"\t\x08Q\x01D")
+        assertLeadingPadding(b"=========BEEFCAKE", b"\t\x08Q\x01D")
+
+        assertIncorrectPadding(b"AB")
+        assertIncorrectPadding(b"ABCD")
+        assertIncorrectPadding(b"ABCDE")
+        assertIncorrectPadding(b"ABCDEFG")
+
+        assertIncorrectPadding(b"BE=", b"\t")
+        assertIncorrectPadding(b"BE==", b"\t")
+        assertIncorrectPadding(b"BE===", b"\t")
+        assertIncorrectPadding(b"BE====", b"\t")
+        assertIncorrectPadding(b"BE=====", b"\t")
+        assertIncorrectPadding(b"BEEF=", b"\t\x08")
+        assertIncorrectPadding(b"BEEF==", b"\t\x08")
+        assertIncorrectPadding(b"BEEF===", b"\t\x08")
+        assertIncorrectPadding(b"BEEFC=", b"\t\x08Q")
+        assertIncorrectPadding(b"BEEFC==", b"\t\x08Q")
+
+        assertDiscontinuousPadding(b"BE=EF===", b"\t\x08")
+        assertDiscontinuousPadding(b"BE==EF==", b"\t\x08")
+        assertDiscontinuousPadding(b"BEEF=C==", b"\t\x08Q")
+        assertDiscontinuousPadding(b"BEEFC=AK", b"\t\x08Q\x01")
+
+        assertInvalidLength(b"A")
+        assertInvalidLength(b"ABC")
+        assertInvalidLength(b"ABCDEF")
+
+        assertInvalidLength(b"A=")
+        assertInvalidLength(b"A==")
+        assertInvalidLength(b"A===")
+        assertInvalidLength(b"A====")
+        assertInvalidLength(b"A=====")
+        assertInvalidLength(b"A======")
+        assertInvalidLength(b"ABC=")
+        assertInvalidLength(b"ABC==")
+        assertInvalidLength(b"ABC===")
+        assertInvalidLength(b"ABC====")
+        assertInvalidLength(b"ABCDEF=")
+
+        assertInvalidLength(b"B=E=====", b"\t")
+        assertInvalidLength(b"B==E====", b"\t")
+        assertInvalidLength(b"BEE=F===", b"\t\x08")
+        assertInvalidLength(b"BEE==F==", b"\t\x08")
+        assertInvalidLength(b"BEEFCA=K", b"\t\x08Q\x01")
+        assertInvalidLength(b"BEEFCA=====K", b"\t\x08Q\x01")
+
+    def test_base32_alphabet(self):
+        alphabet = b'0Aa1Bb2Cc3Dd4Ee5Ff6Gg7Hh8Ii9JjKk'  # 32 distinct characters
+        data = self.type2test(self.rawdata)
+        encoded = binascii.b2a_base32(data, alphabet=alphabet)
+        trans = bytes.maketrans(binascii.BASE32_ALPHABET, alphabet)
+        expected = binascii.b2a_base32(data).translate(trans)  # re-mapped default
+        self.assertEqual(encoded, expected)
+        self.assertEqual(binascii.a2b_base32(encoded, alphabet=alphabet), self.rawdata)
+        self.assertEqual(binascii.b2a_base32(data, alphabet=self.type2test(alphabet)), expected)
+
+        data = self.type2test(b'')  # empty input gives empty output both ways
+        self.assertEqual(binascii.b2a_base32(data, alphabet=alphabet), b'')
+        self.assertEqual(binascii.a2b_base32(data, alphabet=alphabet), b'')
+
+        for func in binascii.b2a_base32, binascii.a2b_base32:
+            with self.assertRaises(TypeError):
+                func(data, alphabet=None)
+            with self.assertRaises(TypeError):  # str alphabet
+                func(data, alphabet=alphabet.decode())
+            with self.assertRaises(ValueError):  # 31 characters
+                func(data, alphabet=alphabet[:-1])
+            with self.assertRaises(ValueError):  # 33 characters
+                func(data, alphabet=alphabet+b'?')
+        with self.assertRaises(TypeError):  # the decoder only takes bytes
+            binascii.a2b_base32(data, alphabet=bytearray(alphabet))
+
      def test_uu(self):
          MAX_UU = 45
          for backtick in (True, False):
diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst

new file mode 100644 (file)

index 0000000..304a7cd
--- /dev/null
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst
@@ -0,0 +1,2 @@
+Add Base32 support to :mod:`binascii` and improve the performance of the
+Base32 converters in :mod:`base64`. Patch by James Seo.
diff --git a/Modules/binascii.c b/Modules/binascii.c

index ebade54173d11b313fba6b075cea8e6363cbca08..7907b74e36f085c9cbf8dc961850d0030a66b031 100644 (file)
--- a/Modules/binascii.c
+++ b/Modules/binascii.c
@@ -244,6 +244,129 @@ static const unsigned char table_b2a_base85_a85[] Py_ALIGNED(64) =
  #define BASE85_A85_Z 0x00000000
  #define BASE85_A85_Y 0x20202020
  
+
+static const unsigned char table_a2b_base32[] Py_ALIGNED(64) = {
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+    -1,-1,26,27, 28,29,30,31, -1,-1,-1,-1, -1,-1,-1,-1,
+    -1, 0, 1, 2,  3, 4, 5, 6,  7, 8, 9,10, 11,12,13,14,
+    15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
+};
+
+static const unsigned char table_b2a_base32[] Py_ALIGNED(64) =
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567";
+
+#define BASE32_PAD '='
+
+/*
+ * Fast base32 encoding/decoding helpers.
+ *
+ * Analogous to the helpers for base64.
+ */
+
+/* Encode 5 bytes into 8 base32 characters (5 bits each, taken from `table`). */
+static inline void
+base32_encode_quint(const unsigned char *in, unsigned char *out,
+                    const unsigned char table[])
+{
+    uint64_t combined = ((uint64_t)in[0] << 32) |
+                        ((uint64_t)in[1] << 24) |
+                        ((uint64_t)in[2] << 16) |
+                        ((uint64_t)in[3] << 8) |
+                         (uint64_t)in[4];  /* 40 bits, in[0] most significant */
+    out[0] = table[(combined >> 35) & 0x1f];  /* top 5 bits */
+    out[1] = table[(combined >> 30) & 0x1f];
+    out[2] = table[(combined >> 25) & 0x1f];
+    out[3] = table[(combined >> 20) & 0x1f];
+    out[4] = table[(combined >> 15) & 0x1f];
+    out[5] = table[(combined >> 10) & 0x1f];
+    out[6] = table[(combined >> 5) & 0x1f];
+    out[7] = table[combined & 0x1f];  /* bottom 5 bits */
+}
+
+/*
+ * Encode multiple complete 5-byte groups, writing 8 characters per group.
+ * Returns the number of input bytes processed (always a multiple of 5).
+ */
+static inline Py_ssize_t
+base32_encode_fast(const unsigned char *in, Py_ssize_t in_len,
+                   unsigned char *out, const unsigned char table[])
+{
+    Py_ssize_t n_quints = in_len / 5;  /* trailing in_len % 5 bytes are skipped */
+    const unsigned char *in_end = in + n_quints * 5;
+
+    while (in < in_end) {
+        base32_encode_quint(in, out, table);
+        in += 5;
+        out += 8;
+    }
+
+    return n_quints * 5;
+}
+
+/*
+ * Decode 8 base32 characters into 5 bytes using the reverse table `table`.
+ * Returns 1 on success, 0 if any character is invalid (out is not written).
+ */
+static inline int
+base32_decode_octa(const unsigned char *in, unsigned char *out,
+                   const unsigned char table[])
+{
+    unsigned char v0 = table[in[0]];
+    unsigned char v1 = table[in[1]];
+    unsigned char v2 = table[in[2]];
+    unsigned char v3 = table[in[3]];
+    unsigned char v4 = table[in[4]];
+    unsigned char v5 = table[in[5]];
+    unsigned char v6 = table[in[6]];
+    unsigned char v7 = table[in[7]];
+
+    if ((v0 | v1 | v2 | v3 | v4 | v5 | v6 | v7) & 0xe0) {  /* some value > 31 */
+        return 0;
+    }
+
+    out[0] = (v0 << 3) | (v1 >> 2);
+    out[1] = (v1 << 6) | (v2 << 1) | (v3 >> 4);  /* stores the low 8 bits only */
+    out[2] = (v3 << 4) | (v4 >> 1);
+    out[3] = (v4 << 7) | (v5 << 2) | (v6 >> 3);
+    out[4] = (v6 << 5) | v7;
+    return 1;
+}
+
+/*
+ * Decode multiple complete 8-character groups (no padding allowed).
+ * Returns the number of input characters processed (a multiple of 8).
+ * Stops at the first invalid character, padding, or incomplete group.
+ */
+static inline Py_ssize_t
+base32_decode_fast(const unsigned char *in, Py_ssize_t in_len,
+                   unsigned char *out, const unsigned char table[])
+{
+    Py_ssize_t n_octas = in_len / 8;  /* complete 8-character groups */
+    Py_ssize_t i;
+
+    for (i = 0; i < n_octas; i++) {
+        if (!base32_decode_octa(in + i * 8, out + i * 5, table)) {
+            break;  /* this group is not counted as processed */
+        }
+    }
+
+    return i * 8;
+}
+
+
  static const unsigned short crctab_hqx[256] = {
      0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7,
      0x8108, 0x9129, 0xa14a, 0xb16b, 0xc18c, 0xd1ad, 0xe1ce, 0xf1ef,
@@ -1367,6 +1490,298 @@ binascii_b2a_base85_impl(PyObject *module, Py_buffer *data, int pad,
      return PyBytesWriter_FinishWithPointer(writer, ascii_data);
  }
  
+/*[clinic input]
+binascii.a2b_base32
+
+    data: ascii_buffer
+    /
+    *
+    alphabet: PyBytesObject(c_default="NULL") = BASE32_ALPHABET
+
+Decode a line of base32 data.
+[clinic start generated code]*/
+
+static PyObject *
+binascii_a2b_base32_impl(PyObject *module, Py_buffer *data,
+                         PyBytesObject *alphabet)
+/*[clinic end generated code: output=12cb58bf547237e2 input=426055ea49ac147e]*/
+{
+    const unsigned char *ascii_data = data->buf;
+    Py_ssize_t ascii_len = data->len;
+    binascii_state *state = NULL;
+    PyObject *table_obj = NULL;  /* released on every exit path below */
+    const unsigned char *table_a2b = table_a2b_base32;  /* default alphabet */
+
+    assert(ascii_len >= 0);
+
+    if (alphabet != NULL) {  /* custom alphabet: use its reverse table */
+        state = get_binascii_state(module);
+        table_obj = get_reverse_table(state, (PyObject *)alphabet, 32, BASE32_PAD);
+        if (table_obj == NULL) {
+            return NULL;
+        }
+        table_a2b = (const unsigned char *)PyBytes_AS_STRING(table_obj);
+    }
+
+    /* Allocate output buffer: 5 bytes per (possibly partial) group of 8. */
+    size_t bin_len = ((size_t)ascii_len + 7) / 8 * 5;
+    PyBytesWriter *writer = PyBytesWriter_Create(bin_len);
+    if (writer == NULL) {
+        Py_XDECREF(table_obj);
+        return NULL;
+    }
+    unsigned char *bin_data = PyBytesWriter_GetData(writer);
+
+    /*
+     * Fast path: use optimized decoder for complete octas (groups of 8 bytes).
+     * The fast path stops at padding, invalid chars, or incomplete octas.
+     */
+    if (ascii_len >= 8) {
+        Py_ssize_t fast_chars = base32_decode_fast(ascii_data, ascii_len,
+                                                   bin_data, table_a2b);
+        if (fast_chars > 0) {
+            ascii_data += fast_chars;
+            ascii_len -= fast_chars;
+            bin_data += (fast_chars / 8) * 5;  /* 5 bytes per octa */
+        }
+    }
+
+    /* Slow path: handle remaining input (padding, invalid chars, incomplete octas). */
+    unsigned char leftchar = 0;  /* bits not yet written to the output */
+    int octa_pos = 0;  /* data characters seen in the current octa */
+    int pads = 0;  /* pad characters seen so far */
+    for (; ascii_len; ascii_len--, ascii_data++) {
+        unsigned char this_ch = *ascii_data;
+
+        /* Check for pad sequences. They may only occur at certain positions. */
+        if (this_ch == BASE32_PAD) {
+            pads++;
+
+            if ((octa_pos == 2 || octa_pos == 4 || octa_pos == 5 || octa_pos == 7)
+                && octa_pos + pads <= 8)
+            {
+                continue;  /* legal position, octa not overfilled */
+            }
+            if (octa_pos == 1 || octa_pos == 3 || octa_pos == 6) {
+                /* Set an error below. */
+                break;
+            }
+            state = get_binascii_state(module);
+            if (state) {
+                PyErr_SetString(state->Error,
+                                (octa_pos == 0 && ascii_data == data->buf)
+                                ? "Leading padding not allowed"
+                                : "Excess padding not allowed");
+            }
+            goto error;
+        }
+
+        unsigned char v = table_a2b[this_ch];
+        if (v >= 32) {  /* not a 5-bit value: rejected as non-base32 */
+            state = get_binascii_state(module);
+            if (state) {
+                PyErr_SetString(state->Error, "Only base32 data is allowed");
+            }
+            goto error;
+        }
+
+        /* Data in the middle of/after the padding is not allowed. */
+        if (pads) {
+            state = get_binascii_state(module);
+            if (state) {
+                PyErr_SetString(state->Error, (octa_pos + pads == 8)
+                                ? "Excess data after padding"
+                                : "Discontinuous padding not allowed");
+            }
+            goto error;
+        }
+
+        switch (octa_pos) {  /* emit a byte once 8 or more bits are pending */
+            case 0:
+                octa_pos = 1;
+                leftchar = v;
+                break;
+            case 1:
+                octa_pos = 2;
+                *bin_data++ = (leftchar << 3) | (v >> 2);
+                leftchar = v & 0x03;
+                break;
+            case 2:
+                octa_pos = 3;
+                leftchar = (leftchar << 5) | v;
+                break;
+            case 3:
+                octa_pos = 4;
+                *bin_data++ = (leftchar << 1) | (v >> 4);
+                leftchar = v & 0x0f;
+                break;
+            case 4:
+                octa_pos = 5;
+                *bin_data++ = (leftchar << 4) | (v >> 1);
+                leftchar = v & 0x01;
+                break;
+            case 5:
+                octa_pos = 6;
+                leftchar = (leftchar << 5) | v;
+                break;
+            case 6:
+                octa_pos = 7;
+                *bin_data++ = (leftchar << 2) | (v >> 3);
+                leftchar = v & 0x07;
+                break;
+            case 7:
+                octa_pos = 0;  /* octa complete */
+                *bin_data++ = (leftchar << 5) | v;
+                leftchar = 0;
+        }
+    }
+
+    if (octa_pos == 1 || octa_pos == 3 || octa_pos == 6) {
+        state = get_binascii_state(module);
+        if (state) {
+            const unsigned char *ascii_data_start = data->buf;
+            PyErr_Format(state->Error,
+                         "Invalid base32-encoded string: "
+                         "number of data characters (%zd) "
+                         "cannot be 1, 3, or 6 more than a multiple of 8",
+                         ascii_data - ascii_data_start);
+        }
+        goto error;
+    }
+
+    if ((octa_pos != 0 && octa_pos + pads != 8)
+        || (octa_pos == 0 && pads != 0))
+    {
+        state = get_binascii_state(module);
+        if (state) {
+            PyErr_SetString(state->Error, "Incorrect padding");
+        }
+        goto error;
+    }
+
+    Py_XDECREF(table_obj);
+    return PyBytesWriter_FinishWithPointer(writer, bin_data);
+
+error:
+    PyBytesWriter_Discard(writer);
+    Py_XDECREF(table_obj);
+    return NULL;
+}
+
+/*[clinic input]
+binascii.b2a_base32
+
+    data: Py_buffer
+    /
+    *
+    alphabet: Py_buffer(c_default="{NULL, NULL}") = BASE32_ALPHABET
+
+Base32-code line of data.
+[clinic start generated code]*/
+
+static PyObject *
+binascii_b2a_base32_impl(PyObject *module, Py_buffer *data,
+                         Py_buffer *alphabet)
+/*[clinic end generated code: output=058d0d1aeb014d3b input=99cbe7194799d368]*/
+{
+    const unsigned char *table_b2a = table_b2a_base32;  /* default alphabet */
+    const unsigned char *bin_data = data->buf;
+    Py_ssize_t bin_len = data->len;
+    binascii_state *state = NULL;
+
+    assert(bin_len >= 0);
+
+    if (alphabet->buf != NULL) {  /* custom alphabet given */
+        if (alphabet->len != 32) {
+            PyErr_SetString(PyExc_ValueError, "alphabet must have length 32");
+            return NULL;
+        }
+        table_b2a = alphabet->buf;  /* used directly as the lookup table */
+    }
+
+    /*
+     * Each group of 5 bytes (rounded up) gets encoded as 8 characters.
+     * Use unsigned integer arithmetic to avoid signed integer overflow.
+     */
+    size_t ascii_len = ((size_t)bin_len + 4u) / 5u * 8u;
+    if (ascii_len > PY_SSIZE_T_MAX) {
+        state = get_binascii_state(module);
+        if (state) {
+            PyErr_SetString(state->Error, "Too much data for base32");
+        }
+        return NULL;
+    }
+    PyBytesWriter *writer = PyBytesWriter_Create(ascii_len);
+    if (writer == NULL) {
+        return NULL;
+    }
+    unsigned char *ascii_data = PyBytesWriter_GetData(writer);
+
+    /* Use the optimized fast path for complete 5-byte groups. */
+    Py_ssize_t fast_bytes = base32_encode_fast(bin_data, bin_len, ascii_data,
+                                               table_b2a);
+    bin_data += fast_bytes;
+    ascii_data += (fast_bytes / 5) * 8;  /* 8 characters per 5-byte group */
+    bin_len -= fast_bytes;
+
+    /* Handle the remaining 0-4 bytes (last data character is zero-filled). */
+    if (bin_len == 1) {
+        /* 1 byte remaining: produces 2 encoded + 6 padding chars. */
+        uint32_t val = bin_data[0];
+        *ascii_data++ = table_b2a[(val >> 3) & 0x1f];
+        *ascii_data++ = table_b2a[(val << 2) & 0x1f];  /* low 3 bits */
+        *ascii_data++ = BASE32_PAD;
+        *ascii_data++ = BASE32_PAD;
+        *ascii_data++ = BASE32_PAD;
+        *ascii_data++ = BASE32_PAD;
+        *ascii_data++ = BASE32_PAD;
+        *ascii_data++ = BASE32_PAD;
+    }
+    else if (bin_len == 2) {
+        /* 2 bytes remaining: produces 4 encoded + 4 padding chars. */
+        uint32_t val = ((uint32_t)bin_data[0] << 8) | bin_data[1];
+        *ascii_data++ = table_b2a[(val >> 11) & 0x1f];
+        *ascii_data++ = table_b2a[(val >> 6) & 0x1f];
+        *ascii_data++ = table_b2a[(val >> 1) & 0x1f];
+        *ascii_data++ = table_b2a[(val << 4) & 0x1f];  /* low 1 bit */
+        *ascii_data++ = BASE32_PAD;
+        *ascii_data++ = BASE32_PAD;
+        *ascii_data++ = BASE32_PAD;
+        *ascii_data++ = BASE32_PAD;
+    }
+    else if (bin_len == 3) {
+        /* 3 bytes remaining: produces 5 encoded + 3 padding chars. */
+        uint32_t val = ((uint32_t)bin_data[0] << 16)
+                       | ((uint32_t)bin_data[1] << 8)
+                       | bin_data[2];
+        *ascii_data++ = table_b2a[(val >> 19) & 0x1f];
+        *ascii_data++ = table_b2a[(val >> 14) & 0x1f];
+        *ascii_data++ = table_b2a[(val >> 9) & 0x1f];
+        *ascii_data++ = table_b2a[(val >> 4) & 0x1f];
+        *ascii_data++ = table_b2a[(val << 1) & 0x1f];  /* low 4 bits */
+        *ascii_data++ = BASE32_PAD;
+        *ascii_data++ = BASE32_PAD;
+        *ascii_data++ = BASE32_PAD;
+    }
+    else if (bin_len == 4) {
+        /* 4 bytes remaining: produces 7 encoded + 1 padding chars. */
+        uint32_t val = ((uint32_t)bin_data[0] << 24)
+                       | ((uint32_t)bin_data[1] << 16)
+                       | ((uint32_t)bin_data[2] << 8)
+                       | bin_data[3];
+        *ascii_data++ = table_b2a[(val >> 27) & 0x1f];
+        *ascii_data++ = table_b2a[(val >> 22) & 0x1f];
+        *ascii_data++ = table_b2a[(val >> 17) & 0x1f];
+        *ascii_data++ = table_b2a[(val >> 12) & 0x1f];
+        *ascii_data++ = table_b2a[(val >> 7) & 0x1f];
+        *ascii_data++ = table_b2a[(val >> 2) & 0x1f];
+        *ascii_data++ = table_b2a[(val << 3) & 0x1f];  /* low 2 bits */
+        *ascii_data++ = BASE32_PAD;
+    }
+
+    return PyBytesWriter_FinishWithPointer(writer, ascii_data);
+}
+
  /*[clinic input]
  binascii.crc_hqx
  
@@ -2028,6 +2443,8 @@ static struct PyMethodDef binascii_module_methods[] = {
      BINASCII_A2B_ASCII85_METHODDEF
      BINASCII_A2B_BASE85_METHODDEF
      BINASCII_B2A_BASE85_METHODDEF
+    BINASCII_A2B_BASE32_METHODDEF
+    BINASCII_B2A_BASE32_METHODDEF
      BINASCII_A2B_HEX_METHODDEF
      BINASCII_B2A_HEX_METHODDEF
      BINASCII_HEXLIFY_METHODDEF
@@ -2114,6 +2531,16 @@ binascii_exec(PyObject *module)
      {
          return -1;
      }
+    if (PyModule_Add(module, "BASE32_ALPHABET",
+        PyBytes_FromStringAndSize((const char *)table_b2a_base32, 32)) < 0)
+    {
+        return -1;
+    }
+    if (PyModule_Add(module, "BASE32HEX_ALPHABET",
+        PyBytes_FromString("0123456789ABCDEFGHIJKLMNOPQRSTUV")) < 0)
+    {
+        return -1;
+    }
  
      state->reverse_table_cache = PyDict_New();
      if (state->reverse_table_cache == NULL) {
diff --git a/Modules/clinic/binascii.c.h b/Modules/clinic/binascii.c.h

index 2fdecc2efbf9d4275aecfb1e045c159a160e573e..7a411bfc8299431d1f36088ff9d58a50651301d0 100644 (file)
--- a/Modules/clinic/binascii.c.h
+++ b/Modules/clinic/binascii.c.h
@@ -711,6 +711,161 @@ exit:
      return return_value;
  }
  
+/* Docstring, method-table entry and argument-parsing wrapper for
+   binascii.a2b_base32(data, /, *, alphabet=BASE32_ALPHABET). */
+PyDoc_STRVAR(binascii_a2b_base32__doc__,
+"a2b_base32($module, data, /, *, alphabet=BASE32_ALPHABET)\n"
+"--\n"
+"\n"
+"Decode a line of base32 data.");
+
+#define BINASCII_A2B_BASE32_METHODDEF    \
+    {"a2b_base32", _PyCFunction_CAST(binascii_a2b_base32), METH_FASTCALL|METH_KEYWORDS, binascii_a2b_base32__doc__},
+
+/* Implementation function; the wrapper below passes alphabet == NULL when
+   the keyword argument is omitted. */
+static PyObject *
+binascii_a2b_base32_impl(PyObject *module, Py_buffer *data,
+                         PyBytesObject *alphabet);
+
+/* METH_FASTCALL|METH_KEYWORDS entry point: unpacks one required positional
+   argument and the optional keyword-only 'alphabet', converts them, and
+   forwards to binascii_a2b_base32_impl().  Returns the impl's result, or
+   NULL if argument unpacking or conversion fails. */
+static PyObject *
+binascii_a2b_base32(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{
+    PyObject *return_value = NULL;
+    /* Core (non-module) builds use a statically allocated tuple of keyword
+       names; every other build leaves KWTUPLE as NULL. */
+    #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+    #define NUM_KEYWORDS 1
+    static struct {
+        PyGC_Head _this_is_not_used;
+        PyObject_VAR_HEAD
+        Py_hash_t ob_hash;
+        PyObject *ob_item[NUM_KEYWORDS];
+    } _kwtuple = {
+        .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+        .ob_hash = -1,
+        .ob_item = { &_Py_ID(alphabet), },
+    };
+    #undef NUM_KEYWORDS
+    #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+    #else  // !Py_BUILD_CORE
+    #  define KWTUPLE NULL
+    #endif  // !Py_BUILD_CORE
+
+    /* The empty first name marks the positional-only 'data' parameter. */
+    static const char * const _keywords[] = {"", "alphabet", NULL};
+    static _PyArg_Parser _parser = {
+        .keywords = _keywords,
+        .fname = "a2b_base32",
+        .kwtuple = KWTUPLE,
+    };
+    #undef KWTUPLE
+    PyObject *argsbuf[2];
+    /* Number of arguments supplied beyond the one required positional. */
+    Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1;
+    Py_buffer data = {NULL, NULL};
+    PyBytesObject *alphabet = NULL;
+
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser,
+            /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
+    if (!args) {
+        goto exit;
+    }
+    if (!ascii_buffer_converter(args[0], &data)) {
+        goto exit;
+    }
+    /* No optional argument given: alphabet stays NULL. */
+    if (!noptargs) {
+        goto skip_optional_kwonly;
+    }
+    /* Unlike b2a_base32, which accepts any buffer, the alphabet here must
+       pass PyBytes_Check(). */
+    if (!PyBytes_Check(args[1])) {
+        _PyArg_BadArgument("a2b_base32", "argument 'alphabet'", "bytes", args[1]);
+        goto exit;
+    }
+    /* Passed through as-is; no new reference is taken here. */
+    alphabet = (PyBytesObject *)args[1];
+skip_optional_kwonly:
+    return_value = binascii_a2b_base32_impl(module, &data, alphabet);
+
+exit:
+    /* Cleanup for data */
+    /* data.obj was initialized to NULL above, so the release is skipped
+       whenever it has not been set. */
+    if (data.obj)
+       PyBuffer_Release(&data);
+
+    return return_value;
+}
+
+/* Docstring, method-table entry and argument-parsing wrapper for
+   binascii.b2a_base32(data, /, *, alphabet=BASE32_ALPHABET). */
+PyDoc_STRVAR(binascii_b2a_base32__doc__,
+"b2a_base32($module, data, /, *, alphabet=BASE32_ALPHABET)\n"
+"--\n"
+"\n"
+"Base32-code line of data.");
+
+#define BINASCII_B2A_BASE32_METHODDEF    \
+    {"b2a_base32", _PyCFunction_CAST(binascii_b2a_base32), METH_FASTCALL|METH_KEYWORDS, binascii_b2a_base32__doc__},
+
+/* Implementation function; it always receives a non-NULL Py_buffer pointer
+   for alphabet, whose fields are still NULL when the keyword is omitted. */
+static PyObject *
+binascii_b2a_base32_impl(PyObject *module, Py_buffer *data,
+                         Py_buffer *alphabet);
+
+/* METH_FASTCALL|METH_KEYWORDS entry point: unpacks one required positional
+   argument and the optional keyword-only 'alphabet', acquires a simple
+   buffer for each, and forwards to binascii_b2a_base32_impl().  Returns the
+   impl's result, or NULL if unpacking or buffer acquisition fails. */
+static PyObject *
+binascii_b2a_base32(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames)
+{
+    PyObject *return_value = NULL;
+    /* Core (non-module) builds use a statically allocated tuple of keyword
+       names; every other build leaves KWTUPLE as NULL. */
+    #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+
+    #define NUM_KEYWORDS 1
+    static struct {
+        PyGC_Head _this_is_not_used;
+        PyObject_VAR_HEAD
+        Py_hash_t ob_hash;
+        PyObject *ob_item[NUM_KEYWORDS];
+    } _kwtuple = {
+        .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
+        .ob_hash = -1,
+        .ob_item = { &_Py_ID(alphabet), },
+    };
+    #undef NUM_KEYWORDS
+    #define KWTUPLE (&_kwtuple.ob_base.ob_base)
+
+    #else  // !Py_BUILD_CORE
+    #  define KWTUPLE NULL
+    #endif  // !Py_BUILD_CORE
+
+    /* The empty first name marks the positional-only 'data' parameter. */
+    static const char * const _keywords[] = {"", "alphabet", NULL};
+    static _PyArg_Parser _parser = {
+        .keywords = _keywords,
+        .fname = "b2a_base32",
+        .kwtuple = KWTUPLE,
+    };
+    #undef KWTUPLE
+    PyObject *argsbuf[2];
+    /* Number of arguments supplied beyond the one required positional. */
+    Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1;
+    Py_buffer data = {NULL, NULL};
+    Py_buffer alphabet = {NULL, NULL};
+
+    args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser,
+            /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
+    if (!args) {
+        goto exit;
+    }
+    if (PyObject_GetBuffer(args[0], &data, PyBUF_SIMPLE) != 0) {
+        goto exit;
+    }
+    /* No optional argument given: alphabet keeps its NULL initializer. */
+    if (!noptargs) {
+        goto skip_optional_kwonly;
+    }
+    /* Unlike a2b_base32, which requires bytes, any object that provides a
+       simple buffer is accepted as the alphabet. */
+    if (PyObject_GetBuffer(args[1], &alphabet, PyBUF_SIMPLE) != 0) {
+        goto exit;
+    }
+skip_optional_kwonly:
+    return_value = binascii_b2a_base32_impl(module, &data, &alphabet);
+
+exit:
+    /* Both buffers were initialized with obj == NULL, so each release below
+       is skipped for a buffer whose obj is still NULL. */
+    /* Cleanup for data */
+    if (data.obj) {
+       PyBuffer_Release(&data);
+    }
+    /* Cleanup for alphabet */
+    if (alphabet.obj) {
+       PyBuffer_Release(&alphabet);
+    }
+
+    return return_value;
+}
+
  PyDoc_STRVAR(binascii_crc_hqx__doc__,
  "crc_hqx($module, data, crc, /)\n"
  "--\n"
@@ -1256,4 +1411,4 @@ exit:
  
      return return_value;
  }
-/*[clinic end generated code: output=84c97096b0fb3819 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=242c0c56b918bd33 input=a9049054013a1b77]*/
author	kangtastic <942136+kangtastic@users.noreply.github.com>
	Sun, 22 Mar 2026 21:10:28 +0000 (14:10 -0700)
committer	GitHub <noreply@github.com>
	Sun, 22 Mar 2026 21:10:28 +0000 (23:10 +0200)
Doc/library/binascii.rst		patch \| blob \| blame \| history
Doc/whatsnew/3.15.rst		patch \| blob \| blame \| history
Lib/base64.py		patch \| blob \| blame \| history
Lib/test/test_binascii.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Core_and_Builtins/2026-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst	[new file with mode: 0644]	patch \| blob
Modules/binascii.c		patch \| blob \| blame \| history
Modules/clinic/binascii.c.h		patch \| blob \| blame \| history