--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Bit-twiddling primitives (ctz, compress, etc.)
+ */
+
+#ifndef BITUTILS_ARCH_COMMON_H
+#define BITUTILS_ARCH_COMMON_H
+
+#include "util/popcount.h"
+
+static really_inline
+u32 clz32_impl_c(u32 x) {
+ return (u32)__builtin_clz(x);
+}
+
+static really_inline
+u32 clz64_impl_c(u64a x) {
+ return (u32)__builtin_clzll(x);
+}
+
+// CTZ (count trailing zero) implementations.
+static really_inline
+u32 ctz32_impl_c(u32 x) {
+ return (u32)__builtin_ctz(x);
+}
+
+static really_inline
+u32 ctz64_impl_c(u64a x) {
+ return (u32)__builtin_ctzll(x);
+}
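+
+/* Quick reference for the builtins above (all undefined for x == 0):
+ * e.g. clz32(0x00010000) == 15 (leading zeros, counted from bit 31 down),
+ * while ctz32(0x00010000) == 16 (trailing zeros, counted from bit 0 up). */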
+
+static really_inline
+u32 lg2_impl_c(u32 x) {
+ if (!x) {
+ return 0;
+ }
+ return 31 - clz32_impl_c(x);
+}
+
+static really_inline
+u64a lg2_64_impl_c(u64a x) {
+ if (!x) {
+ return 0;
+ }
+ return 63 - clz64_impl_c(x);
+}
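+
+/* lg2 is floor(log2(x)) for x > 0, e.g. lg2(8) == 3 and lg2(9) == 3. Note
+ * that x == 0 is defined here to return 0 rather than being undefined. */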
+
+static really_inline
+u32 findAndClearLSB_32_impl_c(u32 *v) {
+ u32 val = *v;
+ u32 offset = ctz32_impl_c(val);
+ *v = val & (val - 1);
+
+ assert(offset < 32);
+ return offset;
+}
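+
+/* Example: for *v == 0b1100, findAndClearLSB_32 returns 2 and leaves
+ * *v == 0b1000; val & (val - 1) is the classic clear-lowest-set-bit trick. */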
+
+static really_inline
+u32 findAndClearLSB_64_impl_c(u64a *v) {
+#ifdef ARCH_64_BIT
+ // generic variant using gcc's builtin on 64-bit
+ u64a val = *v, offset;
+ offset = ctz64_impl_c(val);
+ *v = val & (val - 1);
+#else
+ // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
+ // inline calls to __builtin_ctzll
+ u32 v1 = (u32)*v;
+ u32 v2 = (u32)(*v >> 32);
+ u32 offset;
+ if (v1) {
+ offset = findAndClearLSB_32_impl_c(&v1);
+ *v = (u64a)v1 | ((u64a)v2 << 32);
+ } else {
+ offset = findAndClearLSB_32_impl_c(&v2) + 32;
+ *v = (u64a)v2 << 32;
+ }
+#endif
+
+ assert(offset < 64);
+ return (u32)offset;
+}
+
+static really_inline
+u32 findAndClearMSB_32_impl_c(u32 *v) {
+ u32 val = *v;
+ u32 offset = 31 - clz32_impl_c(val);
+ *v = val & ~(1 << offset);
+
+ assert(offset < 32);
+ return offset;
+}
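+
+/* Example: for *v == 0b1100, findAndClearMSB_32 returns 3 and leaves
+ * *v == 0b0100. */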
+
+static really_inline
+u32 findAndClearMSB_64_impl_c(u64a *v) {
+#ifdef ARCH_64_BIT
+ // generic variant using gcc's builtin on 64-bit
+ u64a val = *v, offset;
+ offset = 63 - clz64_impl_c(val);
+ *v = val & ~(1ULL << offset);
+#else
+ // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
+ // inline calls to __builtin_clzll
+ u32 v1 = (u32)*v;
+ u32 v2 = (u32)(*v >> 32);
+ u32 offset;
+ if (v2) {
+ offset = findAndClearMSB_32_impl_c(&v2) + 32;
+ *v = ((u64a)v2 << 32) | (u64a)v1;
+ } else {
+ offset = findAndClearMSB_32_impl_c(&v1);
+ *v = (u64a)v1;
+ }
+#endif
+
+ assert(offset < 64);
+ return (u32)offset;
+}
+
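+/* Generic PEXT-style compress: gathers the bits of x selected by mask m into
+ * the low bits of the result. This is the parallel-suffix routine from
+ * Hacker's Delight; each of the five rounds moves selected bits right by
+ * 1, 2, 4, 8 and 16 places. E.g. compress32(0b101100, 0b111000) == 0b101. */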
+static really_inline
+u32 compress32_impl_c(u32 x, u32 m) {
+ // Return zero quickly on trivial cases
+ if ((x & m) == 0) {
+ return 0;
+ }
+
+ u32 mk, mp, mv, t;
+
+ x &= m; // clear irrelevant bits
+
+ mk = ~m << 1; // we will count 0's to right
+ for (u32 i = 0; i < 5; i++) {
+ mp = mk ^ (mk << 1);
+ mp ^= mp << 2;
+ mp ^= mp << 4;
+ mp ^= mp << 8;
+ mp ^= mp << 16;
+
+ mv = mp & m; // bits to move
+ m = (m ^ mv) | (mv >> (1 << i)); // compress m
+ t = x & mv;
+ x = (x ^ t) | (t >> (1 << i)); // compress x
+ mk = mk & ~mp;
+ }
+
+ return x;
+}
+
+static really_inline
+u64a compress64_impl_c(u64a x, u64a m) {
+ // Return zero quickly on trivial cases
+ if ((x & m) == 0) {
+ return 0;
+ }
+
+ u64a mk, mp, mv, t;
+
+ x &= m; // clear irrelevant bits
+
+ mk = ~m << 1; // we will count 0's to right
+ for (u32 i = 0; i < 6; i++) {
+ mp = mk ^ (mk << 1);
+ mp ^= mp << 2;
+ mp ^= mp << 4;
+ mp ^= mp << 8;
+ mp ^= mp << 16;
+ mp ^= mp << 32;
+
+ mv = mp & m; // bits to move
+ m = (m ^ mv) | (mv >> (1 << i)); // compress m
+ t = x & mv;
+ x = (x ^ t) | (t >> (1 << i)); // compress x
+ mk = mk & ~mp;
+ }
+
+ return x;
+}
+
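+/* Generic PDEP-style expand: the inverse of compress above. The low bits of
+ * x are scattered to the positions of the set bits in m, e.g.
+ * expand32(0b101, 0b111000) == 0b101000. */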
+static really_inline
+u32 expand32_impl_c(u32 x, u32 m) {
+ // Return zero quickly on trivial cases
+ if (!x || !m) {
+ return 0;
+ }
+
+ u32 m0, mk, mp, mv, t;
+ u32 array[5];
+
+ m0 = m; // save original mask
+ mk = ~m << 1; // we will count 0's to right
+
+ for (int i = 0; i < 5; i++) {
+ mp = mk ^ (mk << 1); // parallel suffix
+ mp = mp ^ (mp << 2);
+ mp = mp ^ (mp << 4);
+ mp = mp ^ (mp << 8);
+ mp = mp ^ (mp << 16);
+ mv = mp & m; // bits to move
+ array[i] = mv;
+ m = (m ^ mv) | (mv >> (1 << i)); // compress m
+ mk = mk & ~mp;
+ }
+
+ for (int i = 4; i >= 0; i--) {
+ mv = array[i];
+ t = x << (1 << i);
+ x = (x & ~mv) | (t & mv);
+ }
+
+ return x & m0; // clear out extraneous bits
+}
+
+static really_inline
+u64a expand64_impl_c(u64a x, u64a m) {
+ // Return zero quickly on trivial cases
+ if (!x || !m) {
+ return 0;
+ }
+
+ u64a m0, mk, mp, mv, t;
+ u64a array[6];
+
+ m0 = m; // save original mask
+ mk = ~m << 1; // we will count 0's to right
+
+ for (int i = 0; i < 6; i++) {
+ mp = mk ^ (mk << 1); // parallel suffix
+ mp = mp ^ (mp << 2);
+ mp = mp ^ (mp << 4);
+ mp = mp ^ (mp << 8);
+ mp = mp ^ (mp << 16);
+ mp = mp ^ (mp << 32);
+ mv = mp & m; // bits to move
+ array[i] = mv;
+ m = (m ^ mv) | (mv >> (1 << i)); // compress m
+ mk = mk & ~mp;
+ }
+
+ for (int i = 5; i >= 0; i--) {
+ mv = array[i];
+ t = x << (1 << i);
+ x = (x & ~mv) | (t & mv);
+ }
+
+ return x & m0; // clear out extraneous bits
+}
+
+/* Returns the index of the first set bit after begin. Pass ~0U as begin to
+ * start from bit 0. Returns ~0U if no bit is set after begin.
+ */
+static really_inline
+u32 bf64_iterate_impl_c(u64a bitfield, u32 begin) {
+ if (begin != ~0U) {
+ /* switch off all bits at or below begin. Note: it is not legal to shift
+ * by the size of the datatype or larger. */
+ assert(begin <= 63);
+ bitfield &= ~((2ULL << begin) - 1);
+ }
+
+ if (!bitfield) {
+ return ~0U;
+ }
+
+ return ctz64_impl_c(bitfield);
+}
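+
+/* Typical iteration over every set bit of a bitfield (a sketch):
+ *
+ *     for (u32 i = bf64_iterate_impl_c(bf, ~0U); i != ~0U;
+ *          i = bf64_iterate_impl_c(bf, i)) {
+ *         // use bit index i
+ *     }
+ */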
+
+static really_inline
+char bf64_set_impl_c(u64a *bitfield, u32 i) {
+ u64a mask = 1ULL << i;
+ char was_set = !!(*bitfield & mask);
+ *bitfield |= mask;
+
+ return was_set;
+}
+
+static really_inline
+void bf64_unset_impl_c(u64a *bitfield, u32 i) {
+ *bitfield &= ~(1ULL << i);
+}
+
+static really_inline
+u32 rank_in_mask32_impl_c(u32 mask, u32 bit) {
+ mask &= (u32)(1U << bit) - 1;
+ return popcount32(mask);
+}
+
+static really_inline
+u32 rank_in_mask64_impl_c(u64a mask, u32 bit) {
+ mask &= (u64a)(1ULL << bit) - 1;
+ return popcount64(mask);
+}
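+
+/* rank_in_mask returns how many set bits of the mask lie strictly below
+ * `bit`, e.g. rank_in_mask32(0b10110, 4) == 2: bit 4 is the third set bit. */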
+
+static really_inline
+u32 pext32_impl_c(u32 x, u32 mask) {
+ u32 result = 0, num = 1;
+ while (mask != 0) {
+ u32 bit = findAndClearLSB_32_impl_c(&mask);
+ if (x & (1U << bit)) {
+ assert(num != 0); // more than 32 bits!
+ result |= num;
+ }
+ num <<= 1;
+ }
+ return result;
+}
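+
+/* The scalar loop above walks the mask LSB-first, copying each selected bit
+ * of x into the next result position; it has the same semantics as
+ * compress32_impl_c, one result bit per set mask bit. */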
+
+static really_inline
+u64a pext64_impl_c(u64a x, u64a mask) {
+ u64a result = 0, num = 1;
+ while (mask != 0) {
+ u32 bit = findAndClearLSB_64_impl_c(&mask);
+ if (x & (1ULL << bit)) {
+ assert(num != 0); // more than 64 bits!
+ result |= num;
+ }
+ num <<= 1;
+ }
+ return result;
+}
+
+#endif // BITUTILS_ARCH_COMMON_H
--- /dev/null
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Bit-twiddling primitives (ctz, compress, etc.)
+ */
+
+#ifndef BITUTILS_ARCH_X86_H
+#define BITUTILS_ARCH_X86_H
+
+#include "ue2common.h"
+#include "util/popcount.h"
+#include "util/arch.h"
+#include "util/intrinsics.h"
+
+#include "util/arch/common/bitutils.h"
+
+static really_inline
+u32 clz32_impl(u32 x) {
+#if defined(_WIN32)
+ unsigned long r;
+ _BitScanReverse(&r, x);
+ return 31 - r;
+#else
+ return clz32_impl_c(x);
+#endif
+}
+
+static really_inline
+u32 clz64_impl(u64a x) {
+#if defined(_WIN64)
+ unsigned long r;
+ _BitScanReverse64(&r, x);
+ return 63 - r;
+#elif defined(_WIN32)
+ unsigned long x1 = (u32)x;
+ unsigned long x2 = (u32)(x >> 32);
+ unsigned long r;
+ if (x2) {
+ _BitScanReverse(&r, x2);
+ return (u32)(31 - r);
+ }
+ _BitScanReverse(&r, (u32)x1);
+ return (u32)(63 - r);
+#else
+ return clz64_impl_c(x);
+#endif
+}
+
+// CTZ (count trailing zero) implementations.
+static really_inline
+u32 ctz32_impl(u32 x) {
+#if defined(_WIN32)
+ unsigned long r;
+ _BitScanForward(&r, x);
+ return r;
+#else
+ return ctz32_impl_c(x);
+#endif
+}
+
+static really_inline
+u32 ctz64_impl(u64a x) {
+#if defined(_WIN64)
+ unsigned long r;
+ _BitScanForward64(&r, x);
+ return r;
+#elif defined(_WIN32)
+ unsigned long r;
+ if (_BitScanForward(&r, (u32)x)) {
+ return (u32)r;
+ }
+ _BitScanForward(&r, x >> 32);
+ return (u32)(r + 32);
+#else
+ return ctz64_impl_c(x);
+#endif
+}
+
+static really_inline
+u32 lg2_impl(u32 x) {
+ return lg2_impl_c(x);
+}
+
+static really_inline
+u64a lg2_64_impl(u64a x) {
+ return lg2_64_impl_c(x);
+}
+
+static really_inline
+u32 findAndClearLSB_32_impl(u32 *v) {
+#ifndef NO_ASM
+ u32 val = *v, offset;
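+ // bsf finds the index of the lowest set bit; btr then clears that bit.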
+ __asm__ ("bsf %1, %0\n"
+ "btr %0, %1\n"
+ : "=r" (offset), "=r" (val)
+ : "1" (val));
+ *v = val;
+
+ assert(offset < 32);
+ return offset;
+#else
+ return findAndClearLSB_32_impl_c(v);
+#endif
+}
+
+static really_inline
+u32 findAndClearLSB_64_impl(u64a *v) {
+#ifdef ARCH_64_BIT
+#if !defined(NO_ASM)
+ u64a val = *v, offset;
+ __asm__ ("bsfq %1, %0\n"
+ "btrq %0, %1\n"
+ : "=r" (offset), "=r" (val)
+ : "1" (val));
+ *v = val;
+#else
+ // generic variant using gcc's builtin on 64-bit
+ u64a val = *v, offset;
+ offset = ctz64_impl(val);
+ *v = val & (val - 1);
+#endif // NO_ASM
+ assert(offset < 64);
+ return (u32)offset;
+#else
+ return findAndClearLSB_64_impl_c(v);
+#endif
+}
+
+static really_inline
+u32 findAndClearMSB_32_impl(u32 *v) {
+#if !defined(NO_ASM)
+ u32 val = *v, offset;
+ __asm__ ("bsr %1, %0\n"
+ "btr %0, %1\n"
+ : "=r" (offset), "=r" (val)
+ : "1" (val));
+ *v = val;
+#else
+ u32 val = *v;
+ u32 offset = 31 - clz32_impl(val);
+ *v = val & ~(1 << offset);
+#endif
+ assert(offset < 32);
+ return offset;
+}
+
+static really_inline
+u32 findAndClearMSB_64_impl(u64a *v) {
+#ifdef ARCH_64_BIT
+#if !defined(NO_ASM)
+ u64a val = *v, offset;
+ __asm__ ("bsrq %1, %0\n"
+ "btrq %0, %1\n"
+ : "=r" (offset), "=r" (val)
+ : "1" (val));
+ *v = val;
+#else
+ // generic variant using gcc's builtin on 64-bit
+ u64a val = *v, offset;
+ offset = 63 - clz64_impl(val);
+ *v = val & ~(1ULL << offset);
+#endif // NO_ASM
+ assert(offset < 64);
+ return (u32)offset;
+#else
+ return findAndClearMSB_64_impl_c(v);
+#endif
+}
+
+static really_inline
+u32 compress32_impl(u32 x, u32 m) {
+#if defined(HAVE_BMI2)
+ // BMI2 has a single instruction for this operation.
+ return _pext_u32(x, m);
+#else
+ return compress32_impl_c(x, m);
+#endif
+}
+
+static really_inline
+u64a compress64_impl(u64a x, u64a m) {
+#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
+ // BMI2 has a single instruction for this operation.
+ return _pext_u64(x, m);
+#else
+ return compress64_impl_c(x, m);
+#endif
+}
+
+static really_inline
+u32 expand32_impl(u32 x, u32 m) {
+#if defined(HAVE_BMI2)
+ // BMI2 has a single instruction for this operation.
+ return _pdep_u32(x, m);
+#else
+ return expand32_impl_c(x, m);
+#endif
+}
+
+static really_inline
+u64a expand64_impl(u64a x, u64a m) {
+#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
+ // BMI2 has a single instruction for this operation.
+ return _pdep_u64(x, m);
+#else
+ return expand64_impl_c(x, m);
+#endif
+}
+
+/* Returns the index of the first set bit after begin. Pass ~0U as begin to
+ * start from bit 0. Returns ~0U if no bit is set after begin.
+ */
+static really_inline
+u32 bf64_iterate_impl(u64a bitfield, u32 begin) {
+ if (begin != ~0U) {
+ /* switch off all bits at or below begin. Note: it is not legal to shift
+ * by the size of the datatype or larger. */
+ assert(begin <= 63);
+ bitfield &= ~((2ULL << begin) - 1);
+ }
+
+ if (!bitfield) {
+ return ~0U;
+ }
+
+ return ctz64_impl(bitfield);
+}
+
+static really_inline
+char bf64_set_impl(u64a *bitfield, u32 i) {
+ return bf64_set_impl_c(bitfield, i);
+}
+
+static really_inline
+void bf64_unset_impl(u64a *bitfield, u32 i) {
+ bf64_unset_impl_c(bitfield, i);
+}
+
+static really_inline
+u32 rank_in_mask32_impl(u32 mask, u32 bit) {
+ return rank_in_mask32_impl_c(mask, bit);
+}
+
+static really_inline
+u32 rank_in_mask64_impl(u64a mask, u32 bit) {
+ return rank_in_mask64_impl_c(mask, bit);
+}
+
+static really_inline
+u32 pext32_impl(u32 x, u32 mask) {
+#if defined(HAVE_BMI2)
+ // Intel BMI2 can do this operation in one instruction.
+ return _pext_u32(x, mask);
+#else
+ return pext32_impl_c(x, mask);
+#endif
+}
+
+static really_inline
+u64a pext64_impl(u64a x, u64a mask) {
+#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
+ // Intel BMI2 can do this operation in one instruction.
+ return _pext_u64(x, mask);
+#else
+ return pext64_impl_c(x, mask);
+#endif
+}
+
+#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
+static really_inline
+u64a pdep64(u64a x, u64a mask) {
+ return _pdep_u64(x, mask);
+}
+#endif
+
+#endif // BITUTILS_ARCH_X86_H
#ifndef BITUTILS_H
#define BITUTILS_H
+#include "config.h"
#include "ue2common.h"
#include "popcount.h"
#include "util/arch.h"
#define DOUBLE_CASE_CLEAR 0xdfdf
#define OCTO_CASE_CLEAR 0xdfdfdfdfdfdfdfdfULL
+
+#if defined(_WIN32) || defined(_WIN64) || defined(ARCH_IA32) || defined(ARCH_X86_64)
+#include "util/arch/x86/bitutils.h"
+#endif
+
static really_inline
u32 clz32(u32 x) {
assert(x); // behaviour not defined for x == 0
-#if defined(_WIN32)
- unsigned long r;
- _BitScanReverse(&r, x);
- return 31 - r;
-#else
- return (u32)__builtin_clz(x);
-#endif
+
+ return clz32_impl(x);
}
static really_inline
u32 clz64(u64a x) {
assert(x); // behaviour not defined for x == 0
-#if defined(_WIN64)
- unsigned long r;
- _BitScanReverse64(&r, x);
- return 63 - r;
-#elif defined(_WIN32)
- unsigned long x1 = (u32)x;
- unsigned long x2 = (u32)(x >> 32);
- unsigned long r;
- if (x2) {
- _BitScanReverse(&r, x2);
- return (u32)(31 - r);
- }
- _BitScanReverse(&r, (u32)x1);
- return (u32)(63 - r);
-#else
- return (u32)__builtin_clzll(x);
-#endif
+
+ return clz64_impl(x);
}
// CTZ (count trailing zero) implementations.
static really_inline
u32 ctz32(u32 x) {
assert(x); // behaviour not defined for x == 0
-#if defined(_WIN32)
- unsigned long r;
- _BitScanForward(&r, x);
- return r;
-#else
- return (u32)__builtin_ctz(x);
-#endif
+
+ return ctz32_impl(x);
}
static really_inline
u32 ctz64(u64a x) {
assert(x); // behaviour not defined for x == 0
-#if defined(_WIN64)
- unsigned long r;
- _BitScanForward64(&r, x);
- return r;
-#elif defined(_WIN32)
- unsigned long r;
- if (_BitScanForward(&r, (u32)x)) {
- return (u32)r;
- }
- _BitScanForward(&r, x >> 32);
- return (u32)(r + 32);
-#else
- return (u32)__builtin_ctzll(x);
-#endif
+
+ return ctz64_impl(x);
}
static really_inline
u32 lg2(u32 x) {
- if (!x) {
- return 0;
- }
- return 31 - clz32(x);
+ return lg2_impl(x);
}
static really_inline
u64a lg2_64(u64a x) {
- if (!x) {
- return 0;
- }
- return 63 - clz64(x);
+ return lg2_64_impl(x);
}
static really_inline
u32 findAndClearLSB_32(u32 *v) {
- assert(*v != 0); // behaviour not defined in this case
-#ifndef NO_ASM
- u32 val = *v, offset;
- __asm__ ("bsf %1, %0\n"
- "btr %0, %1\n"
- : "=r" (offset), "=r" (val)
- : "1" (val));
- *v = val;
-#else
- u32 val = *v;
- u32 offset = ctz32(val);
- *v = val & (val - 1);
-#endif
-
- assert(offset < 32);
- return offset;
+ assert(*v != 0); // behaviour not defined in this case
+ return findAndClearLSB_32_impl(v);
}
static really_inline
u32 findAndClearLSB_64(u64a *v) {
- assert(*v != 0); // behaviour not defined in this case
-
-#ifdef ARCH_64_BIT
-#if defined(ARCH_X86_64) && !defined(NO_ASM)
- u64a val = *v, offset;
- __asm__ ("bsfq %1, %0\n"
- "btrq %0, %1\n"
- : "=r" (offset), "=r" (val)
- : "1" (val));
- *v = val;
-#else
- // generic variant using gcc's builtin on 64-bit
- u64a val = *v, offset;
- offset = ctz64(val);
- *v = val & (val - 1);
-#endif // ARCH_X86_64
-#else
- // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
- // inline calls to __builtin_ctzll
- u32 v1 = (u32)*v;
- u32 v2 = (u32)(*v >> 32);
- u32 offset;
- if (v1) {
- offset = findAndClearLSB_32(&v1);
- *v = (u64a)v1 | ((u64a)v2 << 32);
- } else {
- offset = findAndClearLSB_32(&v2) + 32;
- *v = (u64a)v2 << 32;
- }
-#endif
-
- assert(offset < 64);
- return (u32)offset;
+ assert(*v != 0); // behaviour not defined in this case
+ return findAndClearLSB_64_impl(v);
}
static really_inline
u32 findAndClearMSB_32(u32 *v) {
- assert(*v != 0); // behaviour not defined in this case
-#ifndef NO_ASM
- u32 val = *v, offset;
- __asm__ ("bsr %1, %0\n"
- "btr %0, %1\n"
- : "=r" (offset), "=r" (val)
- : "1" (val));
- *v = val;
-#else
- u32 val = *v;
- u32 offset = 31 - clz32(val);
- *v = val & ~(1 << offset);
-#endif
- assert(offset < 32);
- return offset;
+ assert(*v != 0); // behaviour not defined in this case
+ return findAndClearMSB_32_impl(v);
}
static really_inline
u32 findAndClearMSB_64(u64a *v) {
- assert(*v != 0); // behaviour not defined in this case
-
-#ifdef ARCH_64_BIT
-#if defined(ARCH_X86_64) && !defined(NO_ASM)
- u64a val = *v, offset;
- __asm__ ("bsrq %1, %0\n"
- "btrq %0, %1\n"
- : "=r" (offset), "=r" (val)
- : "1" (val));
- *v = val;
-#else
- // generic variant using gcc's builtin on 64-bit
- u64a val = *v, offset;
- offset = 63 - clz64(val);
- *v = val & ~(1ULL << offset);
-#endif // ARCH_X86_64
-#else
- // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
- // inline calls to __builtin_ctzll
- u32 v1 = (u32)*v;
- u32 v2 = (*v >> 32);
- u32 offset;
- if (v2) {
- offset = findAndClearMSB_32(&v2) + 32;
- *v = ((u64a)v2 << 32) | (u64a)v1;
- } else {
- offset = findAndClearMSB_32(&v1);
- *v = (u64a)v1;
- }
-#endif
-
- assert(offset < 64);
- return (u32)offset;
+ assert(*v != 0); // behaviour not defined in this case
+ return findAndClearMSB_64_impl(v);
}
static really_inline
u32 compress32(u32 x, u32 m) {
-#if defined(HAVE_BMI2)
- // BMI2 has a single instruction for this operation.
- return _pext_u32(x, m);
-#else
-
- // Return zero quickly on trivial cases
- if ((x & m) == 0) {
- return 0;
- }
-
- u32 mk, mp, mv, t;
-
- x &= m; // clear irrelevant bits
-
- mk = ~m << 1; // we will count 0's to right
- for (u32 i = 0; i < 5; i++) {
- mp = mk ^ (mk << 1);
- mp ^= mp << 2;
- mp ^= mp << 4;
- mp ^= mp << 8;
- mp ^= mp << 16;
-
- mv = mp & m; // bits to move
- m = (m ^ mv) | (mv >> (1 << i)); // compress m
- t = x & mv;
- x = (x ^ t) | (t >> (1 << i)); // compress x
- mk = mk & ~mp;
- }
-
- return x;
-#endif
+ return compress32_impl(x, m);
}
static really_inline
u64a compress64(u64a x, u64a m) {
-#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
- // BMI2 has a single instruction for this operation.
- return _pext_u64(x, m);
-#else
-
- // Return zero quickly on trivial cases
- if ((x & m) == 0) {
- return 0;
- }
-
- u64a mk, mp, mv, t;
-
- x &= m; // clear irrelevant bits
-
- mk = ~m << 1; // we will count 0's to right
- for (u32 i = 0; i < 6; i++) {
- mp = mk ^ (mk << 1);
- mp ^= mp << 2;
- mp ^= mp << 4;
- mp ^= mp << 8;
- mp ^= mp << 16;
- mp ^= mp << 32;
-
- mv = mp & m; // bits to move
- m = (m ^ mv) | (mv >> (1 << i)); // compress m
- t = x & mv;
- x = (x ^ t) | (t >> (1 << i)); // compress x
- mk = mk & ~mp;
- }
-
- return x;
-#endif
+ return compress64_impl(x, m);
}
static really_inline
u32 expand32(u32 x, u32 m) {
-#if defined(HAVE_BMI2)
- // BMI2 has a single instruction for this operation.
- return _pdep_u32(x, m);
-#else
-
- // Return zero quickly on trivial cases
- if (!x || !m) {
- return 0;
- }
-
- u32 m0, mk, mp, mv, t;
- u32 array[5];
-
- m0 = m; // save original mask
- mk = ~m << 1; // we will count 0's to right
-
- for (int i = 0; i < 5; i++) {
- mp = mk ^ (mk << 1); // parallel suffix
- mp = mp ^ (mp << 2);
- mp = mp ^ (mp << 4);
- mp = mp ^ (mp << 8);
- mp = mp ^ (mp << 16);
- mv = mp & m; // bits to move
- array[i] = mv;
- m = (m ^ mv) | (mv >> (1 << i)); // compress m
- mk = mk & ~mp;
- }
-
- for (int i = 4; i >= 0; i--) {
- mv = array[i];
- t = x << (1 << i);
- x = (x & ~mv) | (t & mv);
- }
-
- return x & m0; // clear out extraneous bits
-#endif
+ return expand32_impl(x, m);
}
static really_inline
u64a expand64(u64a x, u64a m) {
-#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
- // BMI2 has a single instruction for this operation.
- return _pdep_u64(x, m);
-#else
-
- // Return zero quickly on trivial cases
- if (!x || !m) {
- return 0;
- }
-
- u64a m0, mk, mp, mv, t;
- u64a array[6];
-
- m0 = m; // save original mask
- mk = ~m << 1; // we will count 0's to right
-
- for (int i = 0; i < 6; i++) {
- mp = mk ^ (mk << 1); // parallel suffix
- mp = mp ^ (mp << 2);
- mp = mp ^ (mp << 4);
- mp = mp ^ (mp << 8);
- mp = mp ^ (mp << 16);
- mp = mp ^ (mp << 32);
- mv = mp & m; // bits to move
- array[i] = mv;
- m = (m ^ mv) | (mv >> (1 << i)); // compress m
- mk = mk & ~mp;
- }
-
- for (int i = 5; i >= 0; i--) {
- mv = array[i];
- t = x << (1 << i);
- x = (x & ~mv) | (t & mv);
- }
-
- return x & m0; // clear out extraneous bits
-#endif
+ return expand64_impl(x, m);
}
 /* returns the first set bit after begin (if not ~0U). If no bit is set after
  * begin returns ~0U
  */
static really_inline
u32 bf64_iterate(u64a bitfield, u32 begin) {
- if (begin != ~0U) {
- /* switch off all bits at or below begin. Note: not legal to shift by
- * by size of the datatype or larger. */
- assert(begin <= 63);
- bitfield &= ~((2ULL << begin) - 1);
- }
-
- if (!bitfield) {
- return ~0U;
- }
-
- return ctz64(bitfield);
+ return bf64_iterate_impl(bitfield, begin);
}
static really_inline
char bf64_set(u64a *bitfield, u32 i) {
- assert(i < 64);
- u64a mask = 1ULL << i;
- char was_set = !!(*bitfield & mask);
- *bitfield |= mask;
-
- return was_set;
+ assert(i < 64);
+ return bf64_set_impl(bitfield, i);
}
static really_inline
void bf64_unset(u64a *bitfield, u32 i) {
- assert(i < 64);
- *bitfield &= ~(1ULL << i);
+ assert(i < 64);
+ bf64_unset_impl(bitfield, i);
}
static really_inline
u32 rank_in_mask32(u32 mask, u32 bit) {
- assert(bit < sizeof(u32) * 8);
- assert(mask & (u32)(1U << bit));
- mask &= (u32)(1U << bit) - 1;
- return popcount32(mask);
+ assert(bit < sizeof(u32) * 8);
+ assert(mask & (u32)(1U << bit));
+ return rank_in_mask32_impl(mask, bit);
}
static really_inline
u32 rank_in_mask64(u64a mask, u32 bit) {
- assert(bit < sizeof(u64a) * 8);
- assert(mask & (u64a)(1ULL << bit));
- mask &= (u64a)(1ULL << bit) - 1;
- return popcount64(mask);
+ assert(bit < sizeof(u64a) * 8);
+ assert(mask & (u64a)(1ULL << bit));
+ return rank_in_mask64_impl(mask, bit);
}
static really_inline
u32 pext32(u32 x, u32 mask) {
-#if defined(HAVE_BMI2)
- // Intel BMI2 can do this operation in one instruction.
- return _pext_u32(x, mask);
-#else
-
- u32 result = 0, num = 1;
- while (mask != 0) {
- u32 bit = findAndClearLSB_32(&mask);
- if (x & (1U << bit)) {
- assert(num != 0); // more than 32 bits!
- result |= num;
- }
- num <<= 1;
- }
- return result;
-#endif
+ return pext32_impl(x, mask);
}
static really_inline
u64a pext64(u64a x, u64a mask) {
-#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
- // Intel BMI2 can do this operation in one instruction.
- return _pext_u64(x, mask);
-#else
-
- u32 result = 0, num = 1;
- while (mask != 0) {
- u32 bit = findAndClearLSB_64(&mask);
- if (x & (1ULL << bit)) {
- assert(num != 0); // more than 32 bits!
- result |= num;
- }
- num <<= 1;
- }
- return result;
-#endif
+ return pext64_impl(x, mask);
}
-#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
-static really_inline
-u64a pdep64(u64a x, u64a mask) {
- return _pdep_u64(x, mask);
-}
-#endif
-
#endif // BITUTILS_H