/* Optimized memcpy implementation for cached memory on PowerPC64/POWER8.
   Copyright (C) 2017-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
   Returns 'dst'.

   r3 is never written below, so it still holds 'dst' at every return.
   The copy strategy is chosen from 'len':

     len <= 15	      one scalar move (1, 2, 4, 8 bytes) per set bit of len.
     16 <= len <= 32  two 16-byte vector moves: first 16 and last 16 bytes.
     32 < len <= 64   four 16-byte vector moves: first 32 and last 32 bytes.
     len > 64	      optional 16-byte head move that brings the destination
		      cursor to a 16-byte boundary, then 128-byte loop
		      iterations, then at most one 64-byte step, then a
		      final 32- or 64-byte window that ends exactly at the
		      last byte.

   The "first N / last N" moves are allowed to overlap each other; every
   load of such a pair is issued before the first store of the pair.

   NOTE(review): the vN operands are used with the VSX instructions
   lxvd2x/stxvd2x; which physical registers they name depends on the
   definitions in sysdep.h -- confirm there.  */

	.machine power8
ENTRY_TOCLESS (__memcpy_power8_cached, 5)
	CALL_MCOUNT 3

	cmpldi	cr7,r5,15
	bgt	cr7,L(ge_16)

	/* len <= 15.  r9 is the running destination pointer and r4 the
	   running source pointer.  The mr sits between the andi. and the
	   beq that consumes its cr0 result.  */
	andi.	r9,r5,0x1
	mr	r9,r3
	beq	cr0,L(lt_16_bit1)
	lbz	r10,0(r4)
	addi	r9,r3,1
	addi	r4,r4,1
	stb	r10,0(r3)
L(lt_16_bit1):
	andi.	r10,r5,0x2
	beq	cr0,L(lt_16_bit2)
	lhz	r10,0(r4)
	addi	r9,r9,2
	addi	r4,r4,2
	sth	r10,-2(r9)
L(lt_16_bit2):
	andi.	r10,r5,0x4
	beq	cr0,L(lt_16_bit3)
	lwz	r10,0(r4)
	addi	r9,r9,4
	addi	r4,r4,4
	stw	r10,-4(r9)
L(lt_16_bit3):
	/* Last bit: return right away when no 8-byte piece is left.  */
	andi.	r10,r5,0x8
	beqlr	cr0
	ld	r10,0(r4)
	std	r10,0(r9)
	blr

	.align	4
L(ge_16):
	cmpldi	cr7,r5,32
	ble	cr7,L(ge_16_le_32)
	cmpldi	cr7,r5,64
	ble	cr7,L(gt_32_le_64)

	/* len > 64 from here on.  r12 is the destination cursor.  When dst
	   is not a multiple of 16, store one 16-byte vector at dst itself
	   and advance src and r12 by 16 - (dst & 15) while shrinking len by
	   the same amount.  r12 is then 16-byte aligned, and the bytes
	   between r12 and dst + 16 are simply written a second time by the
	   code that follows.  */
	andi.	r9,r3,0xf
	mr	r12,r3
	beq	cr0,L(dst_is_align_16)
	lxvd2x	v0,0,r4
	subfic	r12,r9,16
	subf	r5,r12,r5
	add	r4,r4,r12
	add	r12,r3,r12
	stxvd2x	v0,0,r3
L(dst_is_align_16):
	cmpldi	cr7,r5,127
	ble	cr7,L(tail_copy)

	/* Loop set-up: CTR = len / 128 iterations, r9 = store cursor,
	   r11/r6/r7 = index offsets 16/32/48, and r0 = len rounded down to
	   a multiple of 128, i.e. the byte count the loop will consume.  */
	mr	r9,r12
	srdi	r10,r5,7
	li	r11,16
	li	r6,32
	li	r7,48
	mtctr	r10
	clrrdi	r0,r5,7

	/* 128 bytes per iteration, handled as two 64-byte halves.  r8 and
	   r10 address the second half of the source and destination.  */
	.align	4
L(copy_128):
	lxvd2x	v10,0,r4
	lxvd2x	v11,r4,r11
	addi	r8,r4,64
	addi	r10,r9,64
	lxvd2x	v12,r4,r6
	lxvd2x	v0,r4,r7
	addi	r4,r4,128
	stxvd2x	v10,0,r9
	stxvd2x	v11,r9,r11
	stxvd2x	v12,r9,r6
	stxvd2x	v0,r9,r7
	addi	r9,r9,128
	lxvd2x	v10,0,r8
	lxvd2x	v11,r8,r11
	lxvd2x	v12,r8,r6
	lxvd2x	v0,r8,r7
	stxvd2x	v10,0,r10
	stxvd2x	v11,r10,r11
	stxvd2x	v12,r10,r6
	stxvd2x	v0,r10,r7
	bdnz	L(copy_128)

	/* Bring r12 up to where the loop stopped storing and keep only the
	   low seven bits of len (the remainder below 128).  */
	add	r12,r12,r0
	clrldi	r5,r5,57
L(tail_copy):
	/* At most 127 bytes remain.  Move one 64-byte chunk if that many
	   are left.  */
	cmpldi	cr7,r5,63
	ble	cr7,L(tail_le_64)
	li	r8,16
	li	r10,32
	lxvd2x	v10,0,r4
	li	r9,48
	addi	r5,r5,-64
	lxvd2x	v11,r4,r8
	lxvd2x	v12,r4,r10
	lxvd2x	v0,r4,r9
	addi	r4,r4,64
	stxvd2x	v10,0,r12
	stxvd2x	v11,r12,r8
	stxvd2x	v12,r12,r10
	stxvd2x	v0,r12,r9
	addi	r12,r12,64

L(tail_le_64):
	cmpldi	cr7,r5,32
	bgt	cr7,L(tail_gt_32_le_64)
	cmpdi	cr7,r5,0
	beqlr	cr7

	/* 1..32 bytes remain.  r5 - 32 is zero or negative, so the two
	   vectors below cover the 32 bytes that end at the last byte of the
	   buffers and may reach back over bytes that were already copied.
	   This code is only reachable for an original len above 64 (shorter
	   lengths branch away at L(ge_16)), so those earlier bytes exist.  */
	addi	r5,r5,-32
	li	r9,16
	add	r8,r4,r5
	add	r10,r12,r5
	lxvd2x	v12,r4,r5
	lxvd2x	v0,r8,r9
	stxvd2x	v12,r12,r5
	stxvd2x	v0,r10,r9
	blr

	/* 16 <= len <= 32: the first 16 bytes plus the 16 bytes that end at
	   src + len / dst + len.  */
	.align	4
L(ge_16_le_32):
	addi	r5,r5,-16
	lxvd2x	v0,0,r4
	lxvd2x	v1,r4,r5
	stxvd2x	v0,0,r3
	stxvd2x	v1,r3,r5
	blr

	/* 32 < len <= 64 straight from entry: nothing has been copied yet,
	   so the destination cursor is dst.  Control then falls through,
	   across any alignment padding, into the shared code below.  */
	.align	4
L(gt_32_le_64):
	mr	r12,r3

	/* More than 32 and at most 64 bytes to move from r4 to r12: the
	   first 32 bytes plus the 32 bytes that end at the last byte.  */
	.align	4
L(tail_gt_32_le_64):
	li	r9,16
	lxvd2x	v0,0,r4
	addi	r5,r5,-32
	lxvd2x	v1,r4,r9
	add	r8,r4,r5
	lxvd2x	v2,r4,r5
	add	r10,r12,r5
	lxvd2x	v3,r8,r9
	stxvd2x	v0,0,r12
	stxvd2x	v1,r12,r9
	stxvd2x	v2,r12,r5
	stxvd2x	v3,r10,r9
	blr

END_GEN_TB (__memcpy_power8_cached,TB_TOCLESS)