start_sequence ();
rtx op0 = expand_normal (arg);
rtx argm1 = expand_simple_binop (mode, PLUS, op0, constm1_rtx, NULL_RTX,
- 1, OPTAB_DIRECT);
+ 1, OPTAB_WIDEN);
if (argm1 == NULL_RTX)
goto fail;
rtx argxorargm1 = expand_simple_binop (mode, nonzero_arg ? AND : XOR, op0,
- argm1, NULL_RTX, 1, OPTAB_DIRECT);
+ argm1, NULL_RTX, 1, OPTAB_WIDEN);
if (argxorargm1 == NULL_RTX)
goto fail;
rtx cmp;
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-expand-details" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+/* PR middle-end/116508 */
+
+#pragma GCC target "+nocssc"
+
+/*
+** h16:
+** sub w([0-9]+), w0, #1
+** eor w([0-9]+), w0, w\1
+** and w([0-9]+), w\1, 65535
+** cmp w\3, w\2, uxth
+** cset w0, cc
+** ret
+*/
+
+/* h16 returns 1 when exactly one bit of its unsigned short argument is set.
+ When expanding popcount == 1, the compiler should use
+ `(arg ^ (arg - 1)) > arg - 1`, as that has a lower latency
+ than doing the popcount and then comparing against 1.
+ The popcount/addv can be costly. */
+unsigned h16 (const unsigned short a) {
+ return __builtin_popcountg (a) == 1;
+}
+
+/* h8 is the unsigned char variant; it should also use the same trick. */
+/* Currently xfailed since the cost does not correctly take into account
+ the moves between GPRs and vector registers. */
+/*
+** h8: { xfail *-*-* }
+** sub w([0-9]+), w0, #1
+** eor w([0-9]+), w0, w\1
+** and w([0-9]+), w\1, 255
+** cmp w\3, w\2, uxtb
+** cset w0, cc
+** ret
+*/
+
+
+unsigned h8 (const unsigned char a) {
+ return __builtin_popcountg (a) == 1;
+}
+
+/* The expand dump should print the costs for popcount == 1 for both h8 and h16. */
+/* { dg-final { scan-rtl-dump-times "popcount == 1:" 2 "expand"} } */