]>
Commit | Line | Data |
---|---|---|
b368d6b8 R |
1 | Analysis of cycle costs for SH4: |
2 | ||
3 | -> udiv_le128: 5 | |
4 | -> udiv_ge64k: 6 | |
5 | -> udiv udiv_25: 10 | |
6 | -> pos_divisor: 3 | |
7 | -> pos_result linear: 5 | |
8 | -> pos_result - -: 5 | |
9 | -> div_le128: 7 | |
10 | -> div_ge64k: 9 | |
11 | sdivsi3 -> udiv_25: 13 | |
12 | udiv_25 -> div_ge64k_end: 15 | |
13 | div_ge64k_end -> rts: 13 | |
14 | div_le128 -> div_le128_2: 2, r1 latency 3 | |
15 | udiv_le128 -> div_le128_2: 2, r1 latency 3 | |
16 | (u)div_le128 -> div_by_1: 9 | |
17 | (u)div_le128 -> rts: 17 | |
18 | div_by_1(_neg) -> rts: 4 | |
19 | div_ge64k -> div_r8: 2 | |
20 | div_ge64k -> div_ge64k_2: 3 | |
21 | udiv_ge64k -> udiv_r8: 3 | |
22 | udiv_ge64k -> div_ge64k_2: 3 + LS | |
23 | (u)div_ge64k -> div_ge64k_end: 13 | |
24 | div_r8 -> div_r8_2: 2 | |
25 | udiv_r8 -> div_r8_2: 2 + LS | |
26 | (u)div_r8 -> rts: 21 | |
27 | ||
28 | -> - + neg_result: 5 | |
29 | -> + - neg_result: 5 | |
30 | -> div_le128_neg: 7 | |
31 | -> div_ge64k_neg: 9 | |
32 | -> div_r8_neg: 11 | |
33 | -> <64k div_ge64k_neg_end: 28 | |
34 | -> >=64k div_ge64k_neg_end: 22 | |
35 | div_ge64k_neg_end ft -> rts: 14 | |
36 | div_r8_neg_end -> rts: 4 | |
37 | div_r8_neg -> div_r8_neg_end: 18 | |
38 | div_le128_neg -> div_by_1_neg: 4 | |
39 | div_le128_neg -> rts: 18 | |
40 | ||
78d310c2 | 41 | sh4-200 absolute divisor range: |
0d52bcc1 | 42 | 1 [2..128] [129..64K) [64K..|dividend|/256] >=64K,>|dividend|/256 |
b368d6b8 R |
43 | udiv 18 22 38 32 30 |
44 | sdiv pos: 20 24 41 35 32 | |
45 | sdiv neg: 15 25 42 36 33 | |
46 | ||
78d310c2 R |
47 | sh4-300 absolute divisor range: |
48 | 8 bit 16 bit 24 bit > 24 bit | |
49 | udiv 15 35 28 25 | |
50 | sdiv 14 36 34 31 | |
51 | ||
b368d6b8 R |
52 | |
53 | fp-based: | |
54 | ||
55 | unsigned: 42 + 3 + 3 (lingering ftrc latency + sts fpul,rx) at caller's site | |
56 | signed: 33 + 3 + 3 (lingering ftrc latency + sts fpul,rx) at caller's site | |
57 | ||
58 | call-div1: divisor range: | |
59 | [1..64K) >= 64K | |
60 | unsigned: 63 58 | |
61 | signed: 76 76 | |
62 | ||
63 | SFUNC_STATIC call overhead: | |
64 | mov.l 0f,r1 | |
65 | bsrf r1 | |
66 | ||
67 | SFUNC_GOT call overhead - current: | |
68 | mov.l 0f,r1 | |
69 | mova 0f,r0 | |
70 | mov.l 1f,r2 | |
71 | add r1,r0 | |
72 | mov.l @(r0,r2),r0 | |
73 | jmp @r0 | |
74 | ; 3 cycles worse than SFUNC_STATIC | |
75 | ||
76 | SFUNC_GOT call overhead - improved assembler: | |
77 | mov.l 0f,r1 | |
78 | mova 0f,r0 | |
79 | mov.l @(r0,r1),r0 | |
80 | jmp @r0 | |
81 | ; 2 cycles worse than SFUNC_STATIC |