]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
bpo-29882: Add _Py_popcount32() function (GH-20518)
author Victor Stinner <vstinner@python.org>
Mon, 8 Jun 2020 14:30:33 +0000 (16:30 +0200)
committer GitHub <noreply@github.com>
Mon, 8 Jun 2020 14:30:33 +0000 (16:30 +0200)
* Rename pycore_byteswap.h to pycore_bitutils.h.
* Move popcount_digit() to pycore_bitutils.h as _Py_popcount32().
* _Py_popcount32() uses GCC and clang builtin function if available.
* Add unit tests for _Py_popcount32().

Include/internal/pycore_bitutils.h [moved from Include/internal/pycore_byteswap.h with 59% similarity]
Makefile.pre.in
Modules/_ctypes/cfield.c
Modules/_testinternalcapi.c
Modules/sha256module.c
Modules/sha512module.c
Objects/longobject.c
Objects/stringlib/codecs.h
PCbuild/pythoncore.vcxproj
PCbuild/pythoncore.vcxproj.filters
Python/hamt.c

similarity index 59%
rename from Include/internal/pycore_byteswap.h
rename to Include/internal/pycore_bitutils.h
index 5e64704a004c82a8296a7356d68919ab76485bf9..36ffe23b9ff26489ca9210128fc01b247c77d5a6 100644 (file)
@@ -1,4 +1,6 @@
-/* Bytes swap functions, reverse order of bytes:
+/* Bit and bytes utilities.
+
+   Bytes swap functions, reverse order of bytes:
 
    - _Py_bswap16(uint16_t)
    - _Py_bswap32(uint32_t)
@@ -82,6 +84,53 @@ _Py_bswap64(uint64_t word)
 }
 
 
+// Population count: count the number of 1's in 'x'
+// (number of bits set to 1), also known as the hamming weight.
+//
+// Implementation note. To keep the implementation simple, CPUID is not used
+// to test whether the x86 POPCNT instruction is available. For example, Visual
+// Studio __popcnt() is not used for this reason. The clang and GCC builtin
+// functions can use the x86 POPCNT instruction if the target architecture has
+// SSE4a or newer.
+static inline int
+_Py_popcount32(uint32_t x)
+{
+#if (defined(__clang__) || defined(__GNUC__))
+
+#if SIZEOF_INT >= 4
+    Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned int));
+    return __builtin_popcount(x);
+#else
+    // The C standard guarantees that unsigned long will always be big enough
+    // to hold a uint32_t value without losing information.
+    Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned long));
+    return __builtin_popcountl(x);
+#endif
+
+#else
+    // 32-bit SWAR (SIMD Within A Register) popcount
+
+    // Binary: 0 1 0 1 ...
+    const uint32_t M1 = 0x55555555;
+    // Binary: 00 11 00 11 ...
+    const uint32_t M2 = 0x33333333;
+    // Binary: 0000 1111 0000 1111 ...
+    const uint32_t M4 = 0x0F0F0F0F;
+    // 256**3 + 256**2 + 256**1 + 256**0
+    const uint32_t SUM = 0x01010101;
+
+    // Put count of each 2 bits into those 2 bits
+    x = x - ((x >> 1) & M1);
+    // Put count of each 4 bits into those 4 bits
+    x = (x & M2) + ((x >> 2) & M2);
+    // Put count of each 8 bits into those 8 bits
+    x = (x + (x >> 4)) & M4;
+    // Sum of the 4 byte counts
+    return (uint32_t)((uint64_t)x * (uint64_t)SUM) >> 24;
+#endif
+}
+
+
 #ifdef __cplusplus
 }
 #endif
index 5a18704e441985bd4f944f24e6aea6c04ffa1d0c..b115e7fc01f74c89042f801ea6d0371528f312b0 100644 (file)
@@ -1121,7 +1121,7 @@ PYTHON_HEADERS= \
                $(srcdir)/Include/internal/pycore_abstract.h \
                $(srcdir)/Include/internal/pycore_accu.h \
                $(srcdir)/Include/internal/pycore_atomic.h \
-               $(srcdir)/Include/internal/pycore_byteswap.h \
+               $(srcdir)/Include/internal/pycore_bitutils.h \
                $(srcdir)/Include/internal/pycore_bytes_methods.h \
                $(srcdir)/Include/internal/pycore_call.h \
                $(srcdir)/Include/internal/pycore_ceval.h \
index 32a2beeb744f7cad829d3332f1c0a90c352262f7..3a9b7119201cf02c6cff16caa377030960795f6d 100644 (file)
@@ -1,5 +1,5 @@
 #include "Python.h"
-#include "pycore_byteswap.h"      // _Py_bswap32()
+#include "pycore_bitutils.h"      // _Py_bswap32()
 
 #include <ffi.h>
 #ifdef MS_WIN32
index 5f217dcb8978e2744389d3a470dc151bfa204a4f..6d5af5917f1f07a93e85545463765b6eec776956 100644 (file)
@@ -12,7 +12,7 @@
 #define PY_SSIZE_T_CLEAN
 
 #include "Python.h"
-#include "pycore_byteswap.h"     // _Py_bswap32()
+#include "pycore_bitutils.h"     // _Py_bswap32()
 #include "pycore_initconfig.h"   // _Py_GetConfigsAsDict()
 #include "pycore_hashtable.h"    // _Py_hashtable_new()
 #include "pycore_gc.h"           // PyGC_Head
@@ -63,6 +63,45 @@ test_bswap(PyObject *self, PyObject *Py_UNUSED(args))
 }
 
 
+static int
+check_popcount(uint32_t x, int expected)
+{
+    // Use volatile to prevent the compiler from optimizing out the whole test
+    volatile uint32_t u = x;
+    int bits = _Py_popcount32(u);
+    if (bits != expected) {
+        PyErr_Format(PyExc_AssertionError,
+                     "_Py_popcount32(%lu) returns %i, expected %i",
+                     (unsigned long)x, bits, expected);
+        return -1;
+    }
+    return 0;
+}
+
+
+static PyObject*
+test_popcount(PyObject *self, PyObject *Py_UNUSED(args))
+{
+#define CHECK(X, RESULT) \
+    do { \
+        if (check_popcount(X, RESULT) < 0) { \
+            return NULL; \
+        } \
+    } while (0)
+
+    CHECK(0, 0);
+    CHECK(1, 1);
+    CHECK(0x08080808, 4);
+    CHECK(0x10101010, 4);
+    CHECK(0x10204080, 4);
+    CHECK(0xDEADCAFE, 22);
+    CHECK(0xFFFFFFFF, 32);
+    Py_RETURN_NONE;
+
+#undef CHECK
+}
+
+
 #define TO_PTR(ch) ((void*)(uintptr_t)ch)
 #define FROM_PTR(ptr) ((uintptr_t)ptr)
 #define VALUE(key) (1 + ((int)(key) - 'a'))
@@ -157,6 +196,7 @@ static PyMethodDef TestMethods[] = {
     {"get_configs", get_configs, METH_NOARGS},
     {"get_recursion_depth", get_recursion_depth, METH_NOARGS},
     {"test_bswap", test_bswap, METH_NOARGS},
+    {"test_popcount", test_popcount, METH_NOARGS},
     {"test_hashtable", test_hashtable, METH_NOARGS},
     {NULL, NULL} /* sentinel */
 };
index 8edb1d53828835ee19cb40dc1019760b8c7f3c43..261f9daee280728fbc4ec0dbfc895b874fb94456 100644 (file)
@@ -17,7 +17,7 @@
 /* SHA objects */
 
 #include "Python.h"
-#include "pycore_byteswap.h"      // _Py_bswap32()
+#include "pycore_bitutils.h"      // _Py_bswap32()
 #include "structmember.h"         // PyMemberDef
 #include "hashlib.h"
 #include "pystrhex.h"
index 561ef8ef0e8676185650403b6df8b742163dc2ed..aa2aeedcc6c6499c510a139375f6c4b04cab1213 100644 (file)
@@ -17,7 +17,7 @@
 /* SHA objects */
 
 #include "Python.h"
-#include "pycore_byteswap.h"      // _Py_bswap32()
+#include "pycore_bitutils.h"      // _Py_bswap32()
 #include "structmember.h"         // PyMemberDef
 #include "hashlib.h"
 #include "pystrhex.h"
index 0b209a403c4b769561adee689b3b4be71401c516..ce10c4f66586a18e0dcf0ca5b0fea5dd9c8304bb 100644 (file)
@@ -3,8 +3,9 @@
 /* XXX The functional organization of this file is terrible */
 
 #include "Python.h"
-#include "pycore_interp.h"    // _PY_NSMALLPOSINTS
-#include "pycore_pystate.h"   // _Py_IsMainInterpreter()
+#include "pycore_bitutils.h"      // _Py_popcount32()
+#include "pycore_interp.h"        // _PY_NSMALLPOSINTS
+#include "pycore_pystate.h"       // _Py_IsMainInterpreter()
 #include "longintrepr.h"
 
 #include <float.h>
@@ -5307,12 +5308,10 @@ int_bit_length_impl(PyObject *self)
 static int
 popcount_digit(digit d)
 {
-    /* 32bit SWAR popcount. */
-    uint32_t u = d;
-    u -= (u >> 1) & 0x55555555U;
-    u = (u & 0x33333333U) + ((u >> 2) & 0x33333333U);
-    u = (u + (u >> 4)) & 0x0f0f0f0fU;
-    return (uint32_t)(u * 0x01010101U) >> 24;
+    // digit can be larger than uint32_t, but only PyLong_SHIFT bits
+    // of it will ever be used.
+    Py_BUILD_ASSERT(PyLong_SHIFT <= 32);
+    return _Py_popcount32((uint32_t)d);
 }
 
 /*[clinic input]
index 9b2a29ba3b8c2a739813ddbfd67ef653a9031d9f..197605b012e5c6d9112d7b7be24f62cb386f8073 100644 (file)
@@ -4,7 +4,7 @@
 # error "codecs.h is specific to Unicode"
 #endif
 
-#include "pycore_byteswap.h"      // _Py_bswap32()
+#include "pycore_bitutils.h"      // _Py_bswap32()
 
 /* Mask to quickly check whether a C 'long' contains a
    non-ASCII, UTF8-encoded char. */
index b6b0cf3e991ba722cd0b3a44f5fb8c6da94144ff..8d5f99f8336a3dda57fc48948fe645bb1873d346 100644 (file)
     <ClInclude Include="..\Include\internal\pycore_accu.h" />
     <ClInclude Include="..\Include\internal\pycore_atomic.h" />
     <ClInclude Include="..\Include\internal\pycore_bytes_methods.h" />
-    <ClInclude Include="..\Include\internal\pycore_byteswap.h" />
+    <ClInclude Include="..\Include\internal\pycore_bitutils.h" />
     <ClInclude Include="..\Include\internal\pycore_call.h" />
     <ClInclude Include="..\Include\internal\pycore_ceval.h" />
     <ClInclude Include="..\Include\internal\pycore_code.h" />
index 10dfffba6113e52805a7571b4d451b6be53e1118..7bc9f8f1664569b0669cf87c148986eef5e3a7af 100644 (file)
     <ClInclude Include="..\Include\internal\pycore_atomic.h">
       <Filter>Include</Filter>
     </ClInclude>
-    <ClInclude Include="..\Include\internal\pycore_byteswap.h">
+    <ClInclude Include="..\Include\internal\pycore_bitutils.h">
       <Filter>Include</Filter>
     </ClInclude>
     <ClInclude Include="..\Include\internal\pycore_bytes_methods.h">
index 8801c5ea418c7f5694767760b9799e25caa4fc4d..e272e8808fd956e8e2636116ea02dcddd99d3f1c 100644 (file)
@@ -1,5 +1,6 @@
 #include "Python.h"
 
+#include "pycore_bitutils.h"      // _Py_popcount32()
 #include "pycore_hamt.h"
 #include "pycore_object.h"        // _PyObject_GC_TRACK()
 #include <stddef.h>               // offsetof()
@@ -433,30 +434,10 @@ hamt_bitpos(int32_t hash, uint32_t shift)
     return (uint32_t)1 << hamt_mask(hash, shift);
 }
 
-static inline uint32_t
-hamt_bitcount(uint32_t i)
-{
-    /* We could use native popcount instruction but that would
-       require to either add configure flags to enable SSE4.2
-       support or to detect it dynamically.  Otherwise, we have
-       a risk of CPython not working properly on older hardware.
-
-       In practice, there's no observable difference in
-       performance between using a popcount instruction or the
-       following fallback code.
-
-       The algorithm is copied from:
-       https://graphics.stanford.edu/~seander/bithacks.html
-    */
-    i = i - ((i >> 1) & 0x55555555);
-    i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
-    return (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24;
-}
-
 static inline uint32_t
 hamt_bitindex(uint32_t bitmap, uint32_t bit)
 {
-    return hamt_bitcount(bitmap & (bit - 1));
+    return (uint32_t)_Py_popcount32(bitmap & (bit - 1));
 }
 
 
@@ -820,7 +801,7 @@ hamt_node_bitmap_assoc(PyHamtNode_Bitmap *self,
     else {
         /* There was no key before with the same (shift,hash). */
 
-        uint32_t n = hamt_bitcount(self->b_bitmap);
+        uint32_t n = (uint32_t)_Py_popcount32(self->b_bitmap);
 
         if (n >= 16) {
             /* When we have a situation where we want to store more