Add an isunordered check for the fast path and a simplified sign check,
and use fmax/fmin when possible.
With gcc-15 on aarch64:
* master:
0000000000000000 <__fmaximum>:
0:
d503245f bti c
4:
1e612000 fcmp d0, d1
8:
5400008d b.le 18 <__fmaximum+0x18>
c:
1e60401f fmov d31, d0
10:
1e6043e0 fmov d0, d31
14:
d65f03c0 ret
18:
54000085 b.pl 28 <__fmaximum+0x28> // b.nfrst
1c:
1e60403f fmov d31, d1
20:
1e6043e0 fmov d0, d31
24:
d65f03c0 ret
28:
54000161 b.ne 54 <__fmaximum+0x54> // b.any
2c:
4f000402 movi v2.4s, #0x0
30:
1e6e101e fmov d30, #1.000000000000000000e+00
34:
6ee0f842 fneg v2.2d, v2.2d
38:
4ea21c5d mov v29.16b, v2.16b
3c:
2e7e1c22 bsl v2.8b, v1.8b, v30.8b
40:
2e7e1c1d bsl v29.8b, v0.8b, v30.8b
44:
1e6223b0 fcmpe d29, d2
48:
1e61ac1f fcsel d31, d0, d1, ge // ge = tcont
4c:
1e6043e0 fmov d0, d31
50:
d65f03c0 ret
54:
1e61281f fadd d31, d0, d1
58:
1e6043e0 fmov d0, d31
5c:
d65f03c0 ret
* patched:
0000000000000000 <__fmaximum>:
0:
d503245f bti c
4:
1e612000 fcmp d0, d1
8:
54000086 b.vs 18 <__fmaximum+0x18>
c:
1e61681f fmaxnm d31, d0, d1
10:
1e6043e0 fmov d0, d31
14:
d65f03c0 ret
18:
1e61281f fadd d31, d0, d1
1c:
1e6043e0 fmov d0, d31
20:
d65f03c0 ret
And with gcc-15 on x86_64:
* master:
0000000000000000 <__fmaximum>:
0: 66 0f 2e c1 ucomisd %xmm1,%xmm0
4: 77 56 ja 5c <__fmaximum+0x5c>
6: 66 0f 2e c8 ucomisd %xmm0,%xmm1
a: 77 4c ja 58 <__fmaximum+0x58>
c: 66 0f 2e c1 ucomisd %xmm1,%xmm0
10: 7a 4e jp 60 <__fmaximum+0x60>
12: 75 4c jne 60 <__fmaximum+0x60>
14: f3 0f 7e 1d 00 00 00 movq 0x0(%rip),%xmm3 # 1c <__fmaximum+0x1c>
1b: 00
1c: f2 0f 10 15 00 00 00 movsd 0x0(%rip),%xmm2 # 24 <__fmaximum+0x24>
23: 00
24: 66 0f 28 e3 movapd %xmm3,%xmm4
28: 66 0f 54 15 00 00 00 andpd 0x0(%rip),%xmm2 # 30 <__fmaximum+0x30>
2f: 00
30: 66 0f 54 e0 andpd %xmm0,%xmm4
34: 66 0f 54 d9 andpd %xmm1,%xmm3
38: 66 0f 56 e2 orpd %xmm2,%xmm4
3c: 66 0f 56 d3 orpd %xmm3,%xmm2
40: f2 0f c2 d4 02 cmplesd %xmm4,%xmm2
45: 66 0f 54 c2 andpd %xmm2,%xmm0
49: 66 0f 55 d1 andnpd %xmm1,%xmm2
4d: 66 0f 56 c2 orpd %xmm2,%xmm0
51: c3 ret
52: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
58: 66 0f 28 c1 movapd %xmm1,%xmm0
5c: c3 ret
5d: 0f 1f 00 nopl (%rax)
60: f2 0f 58 c1 addsd %xmm1,%xmm0
64: c3 ret
* patched:
0000000000000000 <__fmaximum>:
0: 66 0f 2e c1 ucomisd %xmm1,%xmm0
4: 7a 2a jp 30 <__fmaximum+0x30>
6: 77 18 ja 20 <__fmaximum+0x20>
8: 66 0f 2e c8 ucomisd %xmm0,%xmm1
c: 77 08 ja 16 <__fmaximum+0x16>
e: 66 0f 50 c0 movmskpd %xmm0,%eax
12: a8 01 test $0x1,%al
14: 74 0a je 20 <__fmaximum+0x20>
16: 66 0f 28 c1 movapd %xmm1,%xmm0
1a: c3 ret
1b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
20: 66 0f 28 c8 movapd %xmm0,%xmm1
24: 66 0f 28 c1 movapd %xmm1,%xmm0
28: c3 ret
29: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
30: f2 0f 58 c8 addsd %xmm0,%xmm1
34: 66 0f 28 c1 movapd %xmm1,%xmm0
38: c3 ret
Checked on x86_64-linux-gnu, aarch64-linux-gnu, i686-linux-gnu,
arm-linux-gnueabihf, powerpc64le-linux-gnu,
riscv64-linux-gnu-rv64imafdc-lp64d, and loongarch64-linux-gnuf64.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>