add C implementation of pdep64()

author Konstantinos Margaritis <konstantinos@vectorcamp.gr>

Thu, 8 Oct 2020 17:50:18 +0000 (20:50 +0300)

committer Konstantinos Margaritis <konstantinos@vectorcamp.gr>

Thu, 8 Oct 2020 17:50:18 +0000 (20:50 +0300)
author Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Thu, 8 Oct 2020 17:50:18 +0000 (20:50 +0300)
committer Konstantinos Margaritis <konstantinos@vectorcamp.gr>
Thu, 8 Oct 2020 17:50:18 +0000 (20:50 +0300)
diff --git a/src/util/arch/common/bitutils.h b/src/util/arch/common/bitutils.h

index f2706d70bfeec22775b48dda2266a05acbb63452..e86b8d44cf2de282eedef1b112b28921c6b6f791 100644 (file)
--- a/src/util/arch/common/bitutils.h
+++ b/src/util/arch/common/bitutils.h
@@ -351,6 +351,36 @@ u64a pext64_impl_c(u64a x, u64a mask) {
      return result;
  }
  
+/*
+ * Portable C implementation of a 64-bit "parallel bit deposit" (PDEP).
+ *
+ * The low-order popcount(_m) bits of x are scattered, in order, onto the
+ * positions of the set bits of _m: the lowest source bit lands on the lowest
+ * set mask bit, the next source bit on the next set mask bit, and so on.
+ * Result bits at positions where _m is clear are zero. A zero mask yields 0.
+ *
+ * Taken from:
+ * https://gcc.gnu.org/legacy-ml/gcc-patches/2017-06/msg01408.html
+ *
+ * x:   source value whose low-order bits are deposited.
+ * _m:  mask selecting the destination bit positions.
+ *
+ * Returns the deposited value.
+ */
+static really_inline
+u64a pdep64_impl_c(u64a x, u64a _m) {
+    const u64a top_bit = 0x8000000000000000ULL;
+    u64a result = 0;
+    u64a m = _m;
+
+    /* The pop-count of the mask gives the number of source bits to process.
+     * It is also needed to shift bits from the source into the correct
+     * position for the result.
+     *
+     * The 'll' builtins are used on purpose: they operate on unsigned long
+     * long, which is at least 64 bits wide, whereas the 'l' variants take
+     * unsigned long and would truncate the operand where long is 32 bits. */
+    u64a p = 64 - (u64a)__builtin_popcountll(_m);
+
+    /* One iteration per '1' bit in the mask, from the most significant
+     * downwards; each mask bit is cleared as it is processed. */
+    while (m != 0) {
+        /* m is non-zero here, so the count of leading zeros is well defined. */
+        u64a c = (u64a)__builtin_clzll(m);
+        u64a bit = top_bit >> c;
+
+        /* (p - c) is the distance between the source bit and its destination
+         * bit; it always lies in the range 0..63, so the shift is valid. */
+        result |= (x << (p - c)) & bit;
+        m ^= bit;
+        p++;
+    }
+    return result;
+}
+
+
  /* compilers don't reliably synthesize the 32-bit ANDN instruction here,
   * so we force its generation.
   */
author	Konstantinos Margaritis <konstantinos@vectorcamp.gr>
	Thu, 8 Oct 2020 17:50:18 +0000 (20:50 +0300)
committer	Konstantinos Margaritis <konstantinos@vectorcamp.gr>
	Thu, 8 Oct 2020 17:50:18 +0000 (20:50 +0300)