math: Optimize f{max,min}imum_mag{f,l,f128}
Simplify the sign comparison used for ordering and reorganize the checks
to reduce branches and allow targets to use conditional select/move.
With gcc-15 on aarch64 for fmaximum_mag:
* master:
0000000000000000 <__fmaximum_mag>:
0:
d503245f bti c
4:
1e60c004 fabs d4, d0
8:
1e60c023 fabs d3, d1
c:
1e632080 fcmp d4, d3
10:
5400008d b.le 20 <__fmaximum_mag+0x20>
14:
1e60401f fmov d31, d0
18:
1e6043e0 fmov d0, d31
1c:
d65f03c0 ret
20:
54000085 b.pl 30 <__fmaximum_mag+0x30> // b.nfrst
24:
1e60403f fmov d31, d1
28:
1e6043e0 fmov d0, d31
2c:
d65f03c0 ret
30:
54000161 b.ne 5c <__fmaximum_mag+0x5c> // b.any
34:
4f000402 movi v2.4s, #0x0
38:
1e6e101d fmov d29, #1.000000000000000000e+00
3c:
6ee0f842 fneg v2.2d, v2.2d
40:
4ea21c5e mov v30.16b, v2.16b
44:
2e7d1c22 bsl v2.8b, v1.8b, v29.8b
48:
2e7d1c1e bsl v30.8b, v0.8b, v29.8b
4c:
1e6223d0 fcmpe d30, d2
50:
1e61ac1f fcsel d31, d0, d1, ge // ge = tcont
54:
1e6043e0 fmov d0, d31
58:
d65f03c0 ret
5c:
1e61281f fadd d31, d0, d1
60:
1e6043e0 fmov d0, d31
64:
d65f03c0 ret
* patch:
0000000000000000 <__fmaximum_mag>:
0:
d503245f bti c
4:
1e612000 fcmp d0, d1
8:
540000e6 b.vs 24 <__fmaximum_mag+0x24>
c:
1e60c01f fabs d31, d0
10:
1e60c03e fabs d30, d1
14:
1e7e23e0 fcmp d31, d30
18:
540000a0 b.eq 2c <__fmaximum_mag+0x2c> // b.none
1c:
1e60dc20 fcsel d0, d1, d0, le
20:
d65f03c0 ret
24:
1e612800 fadd d0, d0, d1
28:
d65f03c0 ret
2c:
9e660000 fmov x0, d0
30:
f100001f cmp x0, #0x0
34:
1e60bc20 fcsel d0, d1, d0, lt // lt = tstop
38:
d65f03c0 ret
Checked on x86_64-linux-gnu, aarch64-linux-gnu, i686-linux-gnu,
arm-linux-gnueabihf, powerpc64le-linux-gnu,
riscv64-linux-gnu-rv64imafdc-lp64d, and loongarch64-linux-gnuf64.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>