Add an isunordered check for the fast path, simplify the sign check,
and use fmax/fmin when possible.
With gcc-15 on aarch64 for fmaximum_num:
* master:
0000000000000000 <__fmaximum_num>:
0:
d503245f bti c
4:
1e612000 fcmp d0, d1
8:
5400008d b.le 18 <__fmaximum_num+0x18>
c:
1e60401f fmov d31, d0
10:
1e6043e0 fmov d0, d31
14:
d65f03c0 ret
18:
54000085 b.pl 28 <__fmaximum_num+0x28> // b.nfrst
1c:
1e60403f fmov d31, d1
20:
1e6043e0 fmov d0, d31
24:
d65f03c0 ret
28:
54000161 b.ne 54 <__fmaximum_num+0x54> // b.any
2c:
4f000402 movi v2.4s, #0x0
30:
1e6e101e fmov d30, #1.000000000000000000e+00
34:
6ee0f842 fneg v2.2d, v2.2d
38:
4ea21c5d mov v29.16b, v2.16b
3c:
2e7e1c22 bsl v2.8b, v1.8b, v30.8b
40:
2e7e1c1d bsl v29.8b, v0.8b, v30.8b
44:
1e6223b0 fcmpe d29, d2
48:
1e61ac1f fcsel d31, d0, d1, ge // ge = tcont
4c:
1e6043e0 fmov d0, d31
50:
d65f03c0 ret
54:
1e612020 fcmp d1, d1
58:
1e60403f fmov d31, d1
5c:
54ffff87 b.vc 4c <__fmaximum_num+0x4c>
60:
1e602000 fcmp d0, d0
64:
1e60401f fmov d31, d0
68:
54ffff27 b.vc 4c <__fmaximum_num+0x4c>
6c:
1e61281f fadd d31, d0, d1
70:
17fffff7 b 4c <__fmaximum_num+0x4c>
* patch:
0000000000000000 <__fmaximum_num>:
0:
d503245f bti c
4:
1e612000 fcmp d0, d1
8:
54000086 b.vs 18 <__fmaximum_num+0x18>
c:
1e61681f fmaxnm d31, d0, d1
10:
1e6043e0 fmov d0, d31
14:
d65f03c0 ret
18:
1e612020 fcmp d1, d1
1c:
1e60403f fmov d31, d1
20:
54ffff87 b.vc 10 <__fmaximum_num+0x10>
24:
1e602000 fcmp d0, d0
28:
1e60401f fmov d31, d0
2c:
54ffff27 b.vc 10 <__fmaximum_num+0x10>
30:
1e61281f fadd d31, d0, d1
34:
17fffff7 b 10 <__fmaximum_num+0x10>
And with gcc-15 on x86_64:
* master:
0000000000000000 <__fmaximum_num>:
0: 66 0f 2e c1 ucomisd %xmm1,%xmm0
4: 66 0f 28 d8 movapd %xmm0,%xmm3
8: 77 5e ja 68 <__fmaximum_num+0x68>
a: 66 0f 2e c8 ucomisd %xmm0,%xmm1
e: 77 50 ja 60 <__fmaximum_num+0x60>
10: 66 0f 2e c1 ucomisd %xmm1,%xmm0
14: 7a 5a jp 70 <__fmaximum_num+0x70>
16: 75 58 jne 70 <__fmaximum_num+0x70>
18: f3 0f 7e 05 00 00 00 movq 0x0(%rip),%xmm0 # 20 <__fmaximum_num+0x20>
1f: 00
20: f2 0f 10 15 00 00 00 movsd 0x0(%rip),%xmm2 # 28 <__fmaximum_num+0x28>
27: 00
28: 66 0f 28 e0 movapd %xmm0,%xmm4
2c: 66 0f 54 15 00 00 00 andpd 0x0(%rip),%xmm2 # 34 <__fmaximum_num+0x34>
33: 00
34: 66 0f 54 c1 andpd %xmm1,%xmm0
38: 66 0f 54 e3 andpd %xmm3,%xmm4
3c: 66 0f 56 e2 orpd %xmm2,%xmm4
40: 66 0f 56 d0 orpd %xmm0,%xmm2
44: f2 0f c2 d4 02 cmplesd %xmm4,%xmm2
49: 66 0f 54 da andpd %xmm2,%xmm3
4d: 66 0f 55 d1 andnpd %xmm1,%xmm2
51: 66 0f 56 d3 orpd %xmm3,%xmm2
55: 66 0f 28 c2 movapd %xmm2,%xmm0
59: c3 ret
5a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
60: 66 0f 28 c1 movapd %xmm1,%xmm0
64: c3 ret
65: 0f 1f 00 nopl (%rax)
68: c3 ret
69: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
70: 66 0f 2e c9 ucomisd %xmm1,%xmm1
74: 66 0f 28 c1 movapd %xmm1,%xmm0
78: 7b ea jnp 64 <__fmaximum_num+0x64>
7a: 66 0f 2e db ucomisd %xmm3,%xmm3
7e: 66 0f 28 c3 movapd %xmm3,%xmm0
82: 7b e0 jnp 64 <__fmaximum_num+0x64>
84: f2 0f 58 c1 addsd %xmm1,%xmm0
88: c3 ret
* patch:
0000000000000000 <__fmaximum_num>:
0: 66 0f 2e c1 ucomisd %xmm1,%xmm0
4: 7a 2a jp 30 <__fmaximum_num+0x30>
6: 77 18 ja 20 <__fmaximum_num+0x20>
8: 66 0f 2e c8 ucomisd %xmm0,%xmm1
c: 77 08 ja 16 <__fmaximum_num+0x16>
e: 66 0f 50 c0 movmskpd %xmm0,%eax
12: a8 01 test $0x1,%al
14: 74 0a je 20 <__fmaximum_num+0x20>
16: 66 0f 28 c1 movapd %xmm1,%xmm0
1a: c3 ret
1b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
20: 66 0f 28 c8 movapd %xmm0,%xmm1
24: 66 0f 28 c1 movapd %xmm1,%xmm0
28: c3 ret
29: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
30: 66 0f 2e c9 ucomisd %xmm1,%xmm1
34: 7b e0 jnp 16 <__fmaximum_num+0x16>
36: 66 0f 2e c0 ucomisd %xmm0,%xmm0
3a: 7b e4 jnp 20 <__fmaximum_num+0x20>
3c: f2 0f 58 c8 addsd %xmm0,%xmm1
40: eb d4 jmp 16 <__fmaximum_num+0x16>
Checked on x86_64-linux-gnu, aarch64-linux-gnu, i686-linux-gnu,
arm-linux-gnueabihf, powerpc64le-linux-gnu,
riscv64-linux-gnu-rv64imafdc-lp64d, and loongarch64-linux-gnuf64.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>