math: Remove UB and optimize double ilogb
The subnormal exponent calculation invokes UB by left shifting the
signed exponent to find the first leading bit. The implementation
also uses 32 bits operations, which generates suboptimal code in
64 bits architectures.
The patch reimplements ilogb using the math_config.h macros and
uses the new stdbit function to simplify the subnormal handling.
On aarch64 it generates better code:
* master:
0000000000000000 <__ieee754_ilogb>:
0:
9e660000 fmov x0, d0
4:
d360fc02 lsr x2, x0, #32
8:
d360f801 ubfx x1, x0, #32, #31
c:
f26c285f tst x2, #0x7ff00000
10:
540001a1 b.ne 44 <__ieee754_ilogb+0x44> // b.any
14:
2a000022 orr w2, w1, w0
18:
34000322 cbz w2, 7c <__ieee754_ilogb+0x7c>
1c:
35000221 cbnz w1, 60 <__ieee754_ilogb+0x60>
20:
2a0003e1 mov w1, w0
24:
7100001f cmp w0, #0x0
28:
12808240 mov w0, #0xfffffbed // #-1043
2c:
540000ad b.le 40 <__ieee754_ilogb+0x40>
30:
531f7821 lsl w1, w1, #1
34:
51000400 sub w0, w0, #0x1
38:
7100003f cmp w1, #0x0
3c:
54ffffac b.gt 30 <__ieee754_ilogb+0x30>
40:
d65f03c0 ret
44:
13147c20 asr w0, w1, #20
48:
12b00202 mov w2, #0x7fefffff // #
2146435071
4c:
510ffc00 sub w0, w0, #0x3ff
50:
6b02003f cmp w1, w2
54:
12b00001 mov w1, #0x7fffffff // #
2147483647
58:
1a819000 csel w0, w0, w1, ls // ls = plast
5c:
d65f03c0 ret
60:
53155021 lsl w1, w1, #11
64:
12807fa0 mov w0, #0xfffffc02 // #-1022
68:
531f7821 lsl w1, w1, #1
6c:
51000400 sub w0, w0, #0x1
70:
7100003f cmp w1, #0x0
74:
54ffffac b.gt 68 <__ieee754_ilogb+0x68>
78:
d65f03c0 ret
7c:
320107e0 mov w0, #0x80000001 // #-
2147483647
80:
d65f03c0 ret
* patch:
0000000000000000 <__ieee754_ilogb>:
0:
9e660001 fmov x1, d0
4:
d374f820 ubfx x0, x1, #52, #11
8:
350000e0 cbnz w0, 24 <__ieee754_ilogb+0x24>
c:
d374cc21 lsl x1, x1, #12
10:
b4000141 cbz x1, 38 <__ieee754_ilogb+0x38>
14:
dac01021 clz x1, x1
18:
12807fc0 mov w0, #0xfffffc01 // #-1023
1c:
4b010000 sub w0, w0, w1
20:
d65f03c0 ret
24:
711ffc1f cmp w0, #0x7ff
28:
510ffc00 sub w0, w0, #0x3ff
2c:
12b00001 mov w1, #0x7fffffff // #
2147483647
30:
1a811000 csel w0, w0, w1, ne // ne = any
34:
d65f03c0 ret
38:
320107e0 mov w0, #0x80000001 // #-
2147483647
3c:
d65f03c0 ret
Other architecture with support for stdc_leading_zeros and/or
__builtin_clzll should have similar improvements.
Checked on aarch64-linux-gnu and x86_64-linux-gnu.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>