8845cb37 AS |
1 | /* Subroutines used to expand string and block move, clear, |
2 | compare and other operations for PowerPC. | |
a945c346 | 3 | Copyright (C) 1991-2024 Free Software Foundation, Inc. |
8845cb37 AS |
4 | |
5 | This file is part of GCC. | |
6 | ||
7 | GCC is free software; you can redistribute it and/or modify it | |
8 | under the terms of the GNU General Public License as published | |
9 | by the Free Software Foundation; either version 3, or (at your | |
10 | option) any later version. | |
11 | ||
12 | GCC is distributed in the hope that it will be useful, but WITHOUT | |
13 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
14 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public | |
15 | License for more details. | |
16 | ||
17 | You should have received a copy of the GNU General Public License | |
18 | along with GCC; see the file COPYING3. If not see | |
19 | <http://www.gnu.org/licenses/>. */ | |
20 | ||
8fcc61f8 RS |
21 | #define IN_TARGET_CODE 1 |
22 | ||
8845cb37 AS |
23 | #include "config.h" |
24 | #include "system.h" | |
25 | #include "coretypes.h" | |
26 | #include "backend.h" | |
27 | #include "rtl.h" | |
28 | #include "tree.h" | |
29 | #include "memmodel.h" | |
30 | #include "tm_p.h" | |
31 | #include "ira.h" | |
32 | #include "print-tree.h" | |
33 | #include "varasm.h" | |
34 | #include "explow.h" | |
35 | #include "expr.h" | |
36 | #include "output.h" | |
e0bd6c9f | 37 | #include "target.h" |
faaeebd6 AS |
38 | #include "profile-count.h" |
39 | #include "predict.h" | |
8845cb37 AS |
40 | |
41 | /* Expand a block clear operation, and return 1 if successful. Return 0 | |
42 | if we should let the compiler generate normal code. | |
43 | ||
44 | operands[0] is the destination | |
45 | operands[1] is the length | |
46 | operands[3] is the alignment */ | |
47 | ||
48 | int | |
49 | expand_block_clear (rtx operands[]) | |
50 | { | |
51 | rtx orig_dest = operands[0]; | |
52 | rtx bytes_rtx = operands[1]; | |
53 | rtx align_rtx = operands[3]; | |
2e42a52f | 54 | bool constp = CONST_INT_P (bytes_rtx); |
8845cb37 AS |
55 | HOST_WIDE_INT align; |
56 | HOST_WIDE_INT bytes; | |
57 | int offset; | |
58 | int clear_bytes; | |
59 | int clear_step; | |
60 | ||
61 | /* If this is not a fixed size clear, let the compiler emit the memset call.  */ |
62 | if (! constp) | |
63 | return 0; | |
64 | ||
65 | /* This must be a fixed size alignment */ | |
2e42a52f | 66 | gcc_assert (CONST_INT_P (align_rtx)); |
8845cb37 AS |
67 | align = INTVAL (align_rtx) * BITS_PER_UNIT; |
68 | ||
69 | /* Anything to clear? */ | |
70 | bytes = INTVAL (bytes_rtx); | |
71 | if (bytes <= 0) | |
72 | return 1; | |
73 | ||
74 | /* Use the builtin memset after a point, to avoid huge code bloat. | |
75 | When optimize_size, avoid any significant code bloat; calling | |
76 | memset is about 4 instructions, so allow for one instruction to | |
77 | load zero and three to do clearing. */ | |
3b0cb1a5 | 78 | if (TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX)) |
8845cb37 AS |
79 | clear_step = 16; |
80 | else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT)) | |
81 | clear_step = 8; | |
82 | else | |
83 | clear_step = 4; | |
84 | ||
85 | if (optimize_size && bytes > 3 * clear_step) | |
86 | return 0; | |
87 | if (! optimize_size && bytes > 8 * clear_step) | |
88 | return 0; | |
89 | ||
645eee74 AS |
90 | bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX); |
91 | ||
8845cb37 AS |
92 | for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes) |
93 | { | |
94 | machine_mode mode = BLKmode; | |
95 | rtx dest; | |
96 | ||
31369f5a | 97 | if (TARGET_ALTIVEC |
645eee74 | 98 | && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok))) |
8845cb37 AS |
99 | { |
100 | clear_bytes = 16; | |
101 | mode = V4SImode; | |
102 | } | |
103 | else if (bytes >= 8 && TARGET_POWERPC64 | |
104 | && (align >= 64 || !STRICT_ALIGNMENT)) | |
105 | { | |
106 | clear_bytes = 8; | |
107 | mode = DImode; | |
108 | if (offset == 0 && align < 64) | |
109 | { | |
110 | rtx addr; | |
111 | ||
112 | /* If the address form is reg+offset with offset not a | |
113 | multiple of four, reload into reg indirect form here | |
114 | rather than waiting for reload. This way we get one | |
115 | reload, not one per store. */ | |
116 | addr = XEXP (orig_dest, 0); | |
117 | if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM) | |
2e42a52f | 118 | && CONST_INT_P (XEXP (addr, 1)) |
8845cb37 AS |
119 | && (INTVAL (XEXP (addr, 1)) & 3) != 0) |
120 | { | |
121 | addr = copy_addr_to_reg (addr); | |
122 | orig_dest = replace_equiv_address (orig_dest, addr); | |
123 | } | |
124 | } | |
125 | } | |
126 | else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT)) | |
127 | { /* move 4 bytes */ | |
128 | clear_bytes = 4; | |
129 | mode = SImode; | |
130 | } | |
131 | else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT)) | |
132 | { /* move 2 bytes */ | |
133 | clear_bytes = 2; | |
134 | mode = HImode; | |
135 | } | |
136 | else /* move 1 byte at a time */ | |
137 | { | |
138 | clear_bytes = 1; | |
139 | mode = QImode; | |
140 | } | |
141 | ||
142 | dest = adjust_address (orig_dest, mode, offset); | |
143 | ||
144 | emit_move_insn (dest, CONST0_RTX (mode)); | |
145 | } | |
146 | ||
147 | return 1; | |
148 | } | |
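/* Illustrative sketch, not part of the original source: for a 20-byte
   clear with 16-byte (128-bit) alignment on an Altivec target, the loop
   above emits one V4SImode store of zero for the first 16 bytes and one
   SImode store for the remaining 4 bytes, assuming the optimize_size and
   8 * clear_step limits are not exceeded.  */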
149 | ||
150 | /* Figure out the correct instructions to generate to load data for | |
151 | block compare. MODE is used for the read from memory, and | |
152 | data is zero extended if REG is wider than MODE. If LE code | |
153 | is being generated, bswap loads are used. | |
154 | ||
155 | REG is the destination register to move the data into. | |
156 | MEM is the memory block being read. | |
157 | MODE is the mode of memory to use for the read. */ | |
158 | static void | |
159 | do_load_for_compare (rtx reg, rtx mem, machine_mode mode) | |
160 | { | |
161 | switch (GET_MODE (reg)) | |
162 | { | |
9d36bd3b AS |
163 | case E_V16QImode: |
164 | switch (mode) | |
165 | { | |
166 | case E_V16QImode: | |
167 | if (!BYTES_BIG_ENDIAN) | |
168 | { | |
169 | if (TARGET_P9_VECTOR) | |
170 | emit_insn (gen_vsx_ld_elemrev_v16qi_internal (reg, mem)); | |
171 | else | |
172 | { | |
173 | rtx reg_v2di = simplify_gen_subreg (V2DImode, reg, | |
174 | V16QImode, 0); | |
175 | gcc_assert (MEM_P (mem)); | |
176 | rtx addr = XEXP (mem, 0); | |
177 | rtx mem_v2di = gen_rtx_MEM (V2DImode, addr); | |
178 | MEM_COPY_ATTRIBUTES (mem_v2di, mem); | |
179 | set_mem_size (mem, GET_MODE_SIZE (V2DImode)); | |
180 | emit_insn (gen_vsx_ld_elemrev_v2di (reg_v2di, mem_v2di)); | |
181 | } | |
182 | } | |
183 | else | |
184 | emit_insn (gen_vsx_movv2di_64bit (reg, mem)); | |
185 | break; | |
186 | default: | |
187 | gcc_unreachable (); | |
188 | } | |
189 | break; | |
4e10a5a7 | 190 | case E_DImode: |
8845cb37 AS |
191 | switch (mode) |
192 | { | |
4e10a5a7 | 193 | case E_QImode: |
8845cb37 AS |
194 | emit_insn (gen_zero_extendqidi2 (reg, mem)); |
195 | break; | |
4e10a5a7 | 196 | case E_HImode: |
8845cb37 AS |
197 | { |
198 | rtx src = mem; | |
199 | if (!BYTES_BIG_ENDIAN) | |
200 | { | |
201 | src = gen_reg_rtx (HImode); | |
202 | emit_insn (gen_bswaphi2 (src, mem)); | |
203 | } | |
204 | emit_insn (gen_zero_extendhidi2 (reg, src)); | |
205 | break; | |
206 | } | |
4e10a5a7 | 207 | case E_SImode: |
8845cb37 AS |
208 | { |
209 | rtx src = mem; | |
210 | if (!BYTES_BIG_ENDIAN) | |
211 | { | |
212 | src = gen_reg_rtx (SImode); | |
213 | emit_insn (gen_bswapsi2 (src, mem)); | |
214 | } | |
215 | emit_insn (gen_zero_extendsidi2 (reg, src)); | |
216 | } | |
217 | break; | |
4e10a5a7 | 218 | case E_DImode: |
8845cb37 AS |
219 | if (!BYTES_BIG_ENDIAN) |
220 | emit_insn (gen_bswapdi2 (reg, mem)); | |
221 | else | |
222 | emit_insn (gen_movdi (reg, mem)); | |
223 | break; | |
224 | default: | |
225 | gcc_unreachable (); | |
226 | } | |
227 | break; | |
228 | ||
4e10a5a7 | 229 | case E_SImode: |
8845cb37 AS |
230 | switch (mode) |
231 | { | |
4e10a5a7 | 232 | case E_QImode: |
8845cb37 AS |
233 | emit_insn (gen_zero_extendqisi2 (reg, mem)); |
234 | break; | |
4e10a5a7 | 235 | case E_HImode: |
8845cb37 AS |
236 | { |
237 | rtx src = mem; | |
238 | if (!BYTES_BIG_ENDIAN) | |
239 | { | |
240 | src = gen_reg_rtx (HImode); | |
241 | emit_insn (gen_bswaphi2 (src, mem)); | |
242 | } | |
243 | emit_insn (gen_zero_extendhisi2 (reg, src)); | |
244 | break; | |
245 | } | |
4e10a5a7 | 246 | case E_SImode: |
8845cb37 AS |
247 | if (!BYTES_BIG_ENDIAN) |
248 | emit_insn (gen_bswapsi2 (reg, mem)); | |
249 | else | |
250 | emit_insn (gen_movsi (reg, mem)); | |
251 | break; | |
4e10a5a7 | 252 | case E_DImode: |
8845cb37 AS |
253 | /* DImode is larger than the destination reg so is not expected. */ |
254 | gcc_unreachable (); | |
255 | break; | |
256 | default: | |
257 | gcc_unreachable (); | |
258 | } | |
259 | break; | |
9d36bd3b AS |
260 | |
261 | case E_QImode: | |
262 | gcc_assert (mode == E_QImode); | |
263 | emit_move_insn (reg, mem); | |
264 | break; | |
ef4adf1f | 265 | |
8845cb37 AS |
266 | default: |
267 | gcc_unreachable (); | |
268 | break; | |
269 | } | |
270 | } | |
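/* Illustrative note, not part of the original source: on a little-endian
   target loading SImode into a DImode REG, the code above emits bswapsi2
   into a temporary and then zero_extendsidi2, so the lowest-addressed
   byte of the block lands in the most significant position and an
   unsigned register compare orders the data the same way memcmp does.  */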
271 | ||
272 | /* Select the mode to be used for reading the next chunk of bytes | |
273 | in the compare. | |
274 | ||
275 | OFFSET is the current read offset from the beginning of the block. | |
276 | BYTES is the number of bytes remaining to be read. | |
74f9986e | 277 | ALIGN is the minimum alignment, in bytes, of the memory blocks being compared. */
8845cb37 AS |
278 | static machine_mode |
279 | select_block_compare_mode (unsigned HOST_WIDE_INT offset, | |
280 | unsigned HOST_WIDE_INT bytes, | |
74f9986e | 281 | unsigned HOST_WIDE_INT align) |
8845cb37 AS |
282 | { |
283 | /* First see if we can do a whole load unit | |
284 | as that will be more efficient than a larger load + shift. */ | |
285 | ||
286 | /* If big, use biggest chunk. | |
287 | If exactly chunk size, use that size. | |
288 | If remainder can be done in one piece with shifting, do that. | |
289 | Do largest chunk possible without violating alignment rules. */ | |
290 | ||
291 | /* The most we can read without potential page crossing. */ | |
292 | unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align); | |
293 | ||
74f9986e AS |
294 | /* If we have an LE target without ldbrx and word_mode is DImode, |
295 | then we must avoid using word_mode. */ | |
296 | int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX | |
297 | && word_mode == DImode); | |
298 | ||
8845cb37 AS |
299 | if (word_mode_ok && bytes >= UNITS_PER_WORD) |
300 | return word_mode; | |
301 | else if (bytes == GET_MODE_SIZE (SImode)) | |
302 | return SImode; | |
303 | else if (bytes == GET_MODE_SIZE (HImode)) | |
304 | return HImode; | |
305 | else if (bytes == GET_MODE_SIZE (QImode)) | |
306 | return QImode; | |
307 | else if (bytes < GET_MODE_SIZE (SImode) | |
78bd9e25 | 308 | && !targetm.slow_unaligned_access (SImode, align * BITS_PER_UNIT) |
8845cb37 AS |
309 | && offset >= GET_MODE_SIZE (SImode) - bytes) |
310 | /* This matches the case where we have SImode and 3 bytes |
311 | and offset >= 1 and permits us to move back one and overlap | |
312 | with the previous read, thus avoiding having to shift | |
313 | unwanted bytes off of the input. */ | |
314 | return SImode; | |
315 | else if (word_mode_ok && bytes < UNITS_PER_WORD | |
78bd9e25 | 316 | && !targetm.slow_unaligned_access (word_mode, align * BITS_PER_UNIT) |
8845cb37 AS |
317 | && offset >= UNITS_PER_WORD-bytes) |
318 | /* Similarly, if we can use DImode it will get matched here and | |
319 | can do an overlapping read that ends at the end of the block. */ | |
320 | return word_mode; | |
321 | else if (word_mode_ok && maxread >= UNITS_PER_WORD) | |
322 | /* It is safe to do all remaining in one load of largest size, | |
323 | possibly with a shift to get rid of unwanted bytes. */ | |
324 | return word_mode; | |
325 | else if (maxread >= GET_MODE_SIZE (SImode)) | |
326 | /* It is safe to do all remaining in one SImode load, | |
327 | possibly with a shift to get rid of unwanted bytes. */ | |
328 | return SImode; | |
329 | else if (bytes > GET_MODE_SIZE (SImode)) | |
330 | return SImode; | |
331 | else if (bytes > GET_MODE_SIZE (HImode)) | |
332 | return HImode; | |
333 | ||
334 | /* Final fallback is to do one byte at a time.  */ |
335 | return QImode; | |
336 | } | |
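/* Worked example, illustrative and assuming unaligned accesses are not
   slow and word_mode is DImode: with bytes == 7 remaining at offset == 8,
   none of the exact-size cases match, but the overlap case does, so
   word_mode (DImode) is returned and the caller can do an 8-byte read
   that ends at the end of the block, overlapping one byte that was
   already compared.  */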
337 | ||
338 | /* Compute the alignment of pointer+OFFSET where the original alignment | |
339 | of pointer was BASE_ALIGN. */ | |
340 | static unsigned HOST_WIDE_INT | |
341 | compute_current_alignment (unsigned HOST_WIDE_INT base_align, | |
342 | unsigned HOST_WIDE_INT offset) | |
343 | { | |
344 | if (offset == 0) | |
345 | return base_align; | |
346 | return MIN (base_align, offset & -offset); | |
347 | } | |
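/* For example (illustrative, not from the original source): with
   base_align == 16 and offset == 12, offset & -offset is 4, so the known
   alignment at pointer+offset drops to MIN (16, 4) == 4; with offset == 8
   it remains MIN (16, 8) == 8.  */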
348 | ||
5ec3397e AS |
349 | /* Prepare address and then do a load. |
350 | ||
351 | MODE is the mode to use for the load. | |
352 | DEST is the destination register for the data. | |
353 | ADDR is the address to be loaded. | |
354 | ORIG_ADDR is the original address expression. */ | |
355 | static void | |
356 | do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr, | |
357 | rtx orig_addr) | |
358 | { | |
359 | rtx mem = gen_rtx_MEM (mode, addr); | |
360 | MEM_COPY_ATTRIBUTES (mem, orig_addr); | |
361 | set_mem_size (mem, GET_MODE_SIZE (mode)); | |
362 | do_load_for_compare (dest, mem, mode); | |
363 | return; | |
364 | } | |
365 | ||
366 | /* Do a branch for an if/else decision. | |
367 | ||
368 | CMPMODE is the mode to use for the comparison. | |
369 | COMPARISON is the rtx code for the compare needed. | |
370 | A is the first thing to be compared. | |
371 | B is the second thing to be compared. | |
372 | CR is the condition code reg input, or NULL_RTX. | |
373 | TRUE_LABEL is the label to branch to if the condition is true. | |
374 | BR_PROB is the estimated branch probability for the branch. |
5ec3397e AS |
375 | |
376 | There is no return value.  If CR is null_rtx, a new register of |
377 | CMPMODE is generated and used for the comparison. |
378 | If A and B are both null_rtx, then CR must not be null, and the | |
379 | compare is not generated so you can use this with a dot form insn. */ | |
380 | ||
381 | static void | |
382 | do_ifelse (machine_mode cmpmode, rtx_code comparison, | |
faaeebd6 | 383 | rtx a, rtx b, rtx cr, rtx true_label, profile_probability br_prob) |
5ec3397e AS |
384 | { |
385 | gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX) | |
386 | || (a != NULL_RTX && b != NULL_RTX)); | |
387 | ||
388 | if (cr != NULL_RTX) | |
389 | gcc_assert (GET_MODE (cr) == cmpmode); | |
390 | else | |
391 | cr = gen_reg_rtx (cmpmode); | |
392 | ||
393 | rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label); | |
394 | ||
395 | if (a != NULL_RTX) | |
396 | emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b)); | |
397 | ||
398 | rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx); | |
399 | ||
400 | rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx); | |
faaeebd6 AS |
401 | rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); |
402 | add_reg_br_prob_note (j, br_prob); | |
5ec3397e AS |
403 | JUMP_LABEL (j) = true_label; |
404 | LABEL_NUSES (true_label) += 1; | |
405 | } | |
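/* Typical use, as seen later in this file:
   do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX, final_label,
	      profile_probability::unlikely ());
   compares cmp_rem with zero in a fresh CCmode register and emits a
   conditional branch to final_label that is predicted not taken.  */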
406 | ||
407 | /* Emit an isel of the proper mode for DEST. | |
408 | ||
409 | DEST is the isel destination register. |
410 | CMP is the comparison tested against CR. |
411 | SRC_T is the isel source if the condition is true. |
412 | SRC_F is the isel source if false.  CR is the condition register. */ |
413 | static void | |
414 | do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr) | |
415 | { | |
416 | if (GET_MODE (dest) == DImode) | |
4ba3902e | 417 | emit_insn (gen_isel_cc_di (dest, cmp, src_t, src_f, cr)); |
5ec3397e | 418 | else |
4ba3902e | 419 | emit_insn (gen_isel_cc_si (dest, cmp, src_t, src_f, cr)); |
5ec3397e AS |
420 | } |
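/* Usage sketch taken from expand_compare_loop below: after comparing
   bytes_rtx against max_bytes into cr,
   do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr);
   do_isel (iter, cmp_rtx, iter, mi_reg, cr);
   keep loop_cmp and iter when the LE condition holds and otherwise select
   the max_loop_bytes/max_loop_iter values.  */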
421 | ||
422 | /* Emit a subtract of the proper mode for DEST. | |
423 | ||
424 | DEST is the destination register for the subtract. | |
425 | SRC1 is the first subtract input. | |
426 | SRC2 is the second subtract input. | |
427 | ||
428 | Computes DEST = SRC1-SRC2. */ | |
429 | static void | |
430 | do_sub3 (rtx dest, rtx src1, rtx src2) | |
431 | { | |
432 | if (GET_MODE (dest) == DImode) | |
433 | emit_insn (gen_subdi3 (dest, src1, src2)); | |
434 | else | |
435 | emit_insn (gen_subsi3 (dest, src1, src2)); | |
436 | } | |
437 | ||
438 | /* Emit an add of the proper mode for DEST. | |
439 | ||
440 | DEST is the destination register for the add. | |
441 | SRC1 is the first add input. | |
442 | SRC2 is the second add input. | |
443 | ||
444 | Computes DEST = SRC1+SRC2. */ | |
445 | static void | |
446 | do_add3 (rtx dest, rtx src1, rtx src2) | |
447 | { | |
448 | if (GET_MODE (dest) == DImode) | |
449 | emit_insn (gen_adddi3 (dest, src1, src2)); | |
450 | else | |
451 | emit_insn (gen_addsi3 (dest, src1, src2)); | |
452 | } | |
453 | ||
f7e94dfb AS |
454 | /* Emit an and of the proper mode for DEST. |
455 | ||
456 | DEST is the destination register for the and. | |
457 | SRC1 is the first and input. | |
458 | SRC2 is the second and input. | |
459 | ||
460 | Computes DEST = SRC1&SRC2. */ | |
461 | static void | |
462 | do_and3 (rtx dest, rtx src1, rtx src2) | |
463 | { | |
464 | if (GET_MODE (dest) == DImode) | |
465 | emit_insn (gen_anddi3 (dest, src1, src2)); | |
466 | else | |
467 | emit_insn (gen_andsi3 (dest, src1, src2)); | |
468 | } | |
469 | ||
470 | /* Emit a cmpb of the proper mode for DEST. |
471 | ||
472 | DEST is the destination register for the cmpb. | |
473 | SRC1 is the first input. | |
474 | SRC2 is the second input. | |
475 | ||
476 | Sets each byte of DEST to 0xff where the corresponding bytes of SRC1 and SRC2 are equal, and to 0 where they differ. */ |
477 | static void | |
478 | do_cmpb3 (rtx dest, rtx src1, rtx src2) | |
479 | { | |
480 | if (GET_MODE (dest) == DImode) | |
481 | emit_insn (gen_cmpbdi3 (dest, src1, src2)); | |
482 | else | |
483 | emit_insn (gen_cmpbsi3 (dest, src1, src2)); | |
484 | } | |
485 | ||
486 | /* Emit a rotl of the proper mode for DEST. | |
487 | ||
488 | DEST is the destination register for the rotate. |
489 | SRC1 is the value to be rotated. |
490 | SRC2 is the rotate amount. |
491 | ||
492 | Computes DEST = SRC1 rotated left by SRC2. */ | |
493 | static void | |
494 | do_rotl3 (rtx dest, rtx src1, rtx src2) | |
495 | { | |
496 | if (GET_MODE (dest) == DImode) | |
497 | emit_insn (gen_rotldi3 (dest, src1, src2)); | |
498 | else | |
499 | emit_insn (gen_rotlsi3 (dest, src1, src2)); | |
500 | } | |
501 | ||
5ec3397e AS |
502 | /* Generate rtl for a load, shift, and compare of less than a full word. |
503 | ||
504 | LOAD_MODE is the machine mode for the loads. | |
505 | DIFF is the reg for the difference. | |
506 | CMP_REM is the reg containing the remaining bytes to compare. | |
507 | DCOND is the CCUNS reg for the compare if we are doing P9 code with setb. | |
508 | SRC1_ADDR is the first source address. | |
509 | SRC2_ADDR is the second source address. | |
510 | ORIG_SRC1 is the original first source block's address rtx. | |
511 | ORIG_SRC2 is the original second source block's address rtx. */ | |
512 | static void | |
513 | do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem, rtx dcond, | |
514 | rtx src1_addr, rtx src2_addr, rtx orig_src1, rtx orig_src2) | |
515 | { | |
516 | HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode); | |
517 | rtx shift_amount = gen_reg_rtx (word_mode); | |
518 | rtx d1 = gen_reg_rtx (word_mode); | |
519 | rtx d2 = gen_reg_rtx (word_mode); | |
520 | ||
521 | do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1); | |
522 | do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2); | |
523 | do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem); | |
524 | ||
525 | if (word_mode == DImode) | |
526 | { | |
527 | emit_insn (gen_ashldi3 (shift_amount, shift_amount, | |
528 | GEN_INT (LOG2_BITS_PER_UNIT))); | |
529 | emit_insn (gen_lshrdi3 (d1, d1, | |
530 | gen_lowpart (SImode, shift_amount))); | |
531 | emit_insn (gen_lshrdi3 (d2, d2, | |
532 | gen_lowpart (SImode, shift_amount))); | |
533 | } | |
534 | else | |
535 | { | |
536 | emit_insn (gen_ashlsi3 (shift_amount, shift_amount, | |
537 | GEN_INT (LOG2_BITS_PER_UNIT))); | |
538 | emit_insn (gen_lshrsi3 (d1, d1, shift_amount)); | |
539 | emit_insn (gen_lshrsi3 (d2, d2, shift_amount)); | |
540 | } | |
541 | ||
542 | if (TARGET_P9_MISC) | |
543 | { | |
544 | /* Generate a compare, and convert with a setb later. */ | |
545 | rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2); | |
546 | emit_insn (gen_rtx_SET (dcond, cmp)); | |
547 | } | |
548 | else | |
549 | { | |
550 | if (word_mode == DImode) | |
551 | emit_insn (gen_subfdi3_carry (diff, d2, d1)); | |
552 | else | |
553 | emit_insn (gen_subfsi3_carry (diff, d2, d1)); | |
554 | } | |
555 | } | |
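/* Illustrative walk-through, not in the original source: with an 8-byte
   load_mode and cmp_rem == 3, shift_amount becomes (8 - 3) * 8 = 40, so
   both loaded words are shifted right by 40 bits.  Because the loads are
   byte-swapped on LE (and natural on BE), the three bytes of interest sit
   in the most significant positions and the five bytes beyond the
   remaining length are shifted out before the compare or subtract.  */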
556 | ||
557 | /* Generate rtl for an overlapping load and compare of less than a | |
558 | full load_mode. This assumes that the previous word is part of the | |
559 | block being compared, so it is OK to back up by part of a word and |
560 | compare the last unaligned full word that ends at the end of the block. | |
561 | ||
562 | LOAD_MODE is the machine mode for the loads. | |
563 | ISCONST tells whether the remaining length is a constant or in a register. | |
564 | BYTES_REM is the remaining length if ISCONST is true. | |
565 | DIFF is the reg for the difference. | |
566 | CMP_REM is the reg containing the remaining bytes to compare if !ISCONST. | |
567 | DCOND is the CCUNS reg for the compare if we are doing P9 code with setb. | |
568 | SRC1_ADDR is the first source address. | |
569 | SRC2_ADDR is the second source address. | |
570 | ORIG_SRC1 is the original first source block's address rtx. | |
571 | ORIG_SRC2 is the original second source block's address rtx. */ | |
572 | static void | |
573 | do_overlap_load_compare (machine_mode load_mode, bool isConst, | |
574 | HOST_WIDE_INT bytes_rem, rtx diff, | |
575 | rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr, | |
576 | rtx orig_src1, rtx orig_src2) | |
577 | { | |
578 | HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode); | |
579 | HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem; | |
580 | rtx d1 = gen_reg_rtx (word_mode); | |
581 | rtx d2 = gen_reg_rtx (word_mode); | |
582 | ||
583 | rtx addr1, addr2; | |
584 | if (!isConst || addr_adj) | |
585 | { | |
586 | rtx adj_reg = gen_reg_rtx (word_mode); | |
587 | if (isConst) | |
588 | emit_move_insn (adj_reg, GEN_INT (-addr_adj)); | |
589 | else | |
590 | { | |
591 | rtx reg_lms = gen_reg_rtx (word_mode); | |
592 | emit_move_insn (reg_lms, GEN_INT (load_mode_size)); | |
593 | do_sub3 (adj_reg, cmp_rem, reg_lms); | |
594 | } | |
595 | ||
596 | addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg); | |
597 | addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg); | |
598 | } | |
599 | else | |
600 | { | |
601 | addr1 = src1_addr; | |
602 | addr2 = src2_addr; | |
603 | } | |
604 | ||
605 | do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1); | |
606 | do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2); | |
607 | ||
608 | if (TARGET_P9_MISC) | |
609 | { | |
610 | /* Generate a compare, and convert with a setb later. */ | |
611 | rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2); | |
612 | emit_insn (gen_rtx_SET (dcond, cmp)); | |
613 | } | |
614 | else | |
615 | { | |
616 | if (word_mode == DImode) | |
617 | emit_insn (gen_subfdi3_carry (diff, d2, d1)); | |
618 | else | |
619 | emit_insn (gen_subfsi3_carry (diff, d2, d1)); | |
620 | } | |
621 | } | |
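/* Illustrative example, not in the original source: with an 8-byte
   load_mode and a constant bytes_rem of 3, addr_adj is 5, so both
   addresses are moved back by 5 bytes and the 8-byte loads end exactly at
   the end of the block, re-reading 5 bytes already known to be equal from
   the preceding compare.  */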
622 | ||
37ae4739 AS |
623 | /* Generate the sequence of compares for strcmp/strncmp using vec/vsx |
624 | instructions. | |
625 | ||
626 | BYTES_TO_COMPARE is the number of bytes to be compared. | |
627 | ORIG_SRC1 is the unmodified rtx for the first string. | |
628 | ORIG_SRC2 is the unmodified rtx for the second string. | |
629 | S1ADDR is the register to use for the base address of the first string. | |
630 | S2ADDR is the register to use for the base address of the second string. | |
631 | OFF_REG is the register to use for the string offset for loads. | |
632 | S1DATA is the register for loading the first string. | |
633 | S2DATA is the register for loading the second string. | |
634 | VEC_RESULT is the rtx for the vector result indicating the byte difference. | |
635 | EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call | |
636 | to strcmp/strncmp if we have equality at the end of the inline comparison. | |
637 | P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code | |
638 | to clean up and generate the final comparison result. | |
639 | FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just | |
640 | set the final result. | |
641 | CHECKZERO indicates whether the sequence should check for zero bytes | |
642 | for use doing strncmp, or not (for use doing memcmp). */ | |
643 | static void | |
644 | expand_cmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare, | |
645 | rtx orig_src1, rtx orig_src2, | |
646 | rtx s1addr, rtx s2addr, rtx off_reg, | |
647 | rtx s1data, rtx s2data, rtx vec_result, | |
648 | bool equality_compare_rest, rtx *p_cleanup_label, | |
649 | rtx final_move_label, bool checkzero) | |
650 | { | |
651 | machine_mode load_mode; | |
652 | unsigned int load_mode_size; | |
653 | unsigned HOST_WIDE_INT cmp_bytes = 0; | |
654 | unsigned HOST_WIDE_INT offset = 0; | |
655 | rtx zero_reg = NULL; | |
656 | ||
657 | gcc_assert (p_cleanup_label != NULL); | |
658 | rtx cleanup_label = *p_cleanup_label; | |
659 | ||
660 | emit_move_insn (s1addr, force_reg (Pmode, XEXP (orig_src1, 0))); | |
661 | emit_move_insn (s2addr, force_reg (Pmode, XEXP (orig_src2, 0))); | |
662 | ||
663 | if (checkzero && !TARGET_P9_VECTOR) | |
664 | { | |
665 | zero_reg = gen_reg_rtx (V16QImode); | |
666 | emit_move_insn (zero_reg, CONST0_RTX (V16QImode)); | |
667 | } | |
668 | ||
669 | while (bytes_to_compare > 0) | |
670 | { | |
671 | /* VEC/VSX compare sequence for P8: | |
672 | check each 16B with: | |
673 | lxvd2x 32,28,8 | |
674 | lxvd2x 33,29,8 | |
675 | vcmpequb 2,0,1 # compare strings | |
676 | vcmpequb 4,0,3 # compare w/ 0 | |
677 | xxlorc 37,36,34 # first FF byte is either mismatch or end of string | |
678 | vcmpequb. 7,5,3 # reg 7 contains 0 | |
679 | bnl 6,.Lmismatch | |
680 | ||
681 | For the P8 LE case, we use lxvd2x and compare full 16 bytes | |
700d4cb0 | 682 | but then use vgbbd and a shift to get two bytes with the |
37ae4739 AS |
683 | information we need in the correct order. |
684 | ||
685 | VEC/VSX compare sequence if TARGET_P9_VECTOR: | |
686 | lxvb16x/lxvb16x # load 16B of each string | |
687 | vcmpnezb. # produces difference location or zero byte location | |
688 | bne 6,.Lmismatch | |
689 | ||
690 | Use the overlapping compare trick for the last block if it is | |
691 | less than 16 bytes. | |
692 | */ | |
693 | ||
694 | load_mode = V16QImode; | |
695 | load_mode_size = GET_MODE_SIZE (load_mode); | |
696 | ||
697 | if (bytes_to_compare >= load_mode_size) | |
698 | cmp_bytes = load_mode_size; | |
699 | else | |
700 | { | |
701 | /* Move this load back so it doesn't go past the end. P8/P9 | |
702 | can do this efficiently. This is never called with less | |
703 | than 16 bytes so we should always be able to do this. */ | |
704 | unsigned int extra_bytes = load_mode_size - bytes_to_compare; | |
705 | cmp_bytes = bytes_to_compare; | |
706 | gcc_assert (offset > extra_bytes); | |
707 | offset -= extra_bytes; | |
708 | cmp_bytes = load_mode_size; | |
709 | bytes_to_compare = cmp_bytes; | |
710 | } | |
711 | ||
712 | /* The offset currently used is always kept in off_reg so that the | |
713 | cleanup code on P8 can use it to extract the differing byte. */ | |
714 | emit_move_insn (off_reg, GEN_INT (offset)); | |
715 | ||
716 | rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg); | |
717 | do_load_for_compare_from_addr (load_mode, s1data, addr1, orig_src1); | |
718 | rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg); | |
719 | do_load_for_compare_from_addr (load_mode, s2data, addr2, orig_src2); | |
720 | ||
721 | /* Cases to handle. A and B are chunks of the two strings. | |
722 | 1: Not end of comparison: | |
723 | A != B: branch to cleanup code to compute result. | |
724 | A == B: next block | |
725 | 2: End of the inline comparison: | |
726 | A != B: branch to cleanup code to compute result. | |
727 | A == B: call strcmp/strncmp | |
728 | 3: compared requested N bytes: | |
729 | A == B: branch to result 0. | |
730 | A != B: cleanup code to compute result. */ | |
731 | ||
732 | unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes; | |
733 | ||
734 | if (checkzero) | |
735 | { | |
736 | if (TARGET_P9_VECTOR) | |
737 | emit_insn (gen_vcmpnezb_p (vec_result, s1data, s2data)); | |
738 | else | |
739 | { | |
740 | /* Emit instructions to do comparison and zero check. */ | |
741 | rtx cmp_res = gen_reg_rtx (load_mode); | |
742 | rtx cmp_zero = gen_reg_rtx (load_mode); | |
743 | rtx cmp_combined = gen_reg_rtx (load_mode); | |
744 | emit_insn (gen_altivec_eqv16qi (cmp_res, s1data, s2data)); | |
745 | emit_insn (gen_altivec_eqv16qi (cmp_zero, s1data, zero_reg)); | |
23f195b0 | 746 | emit_insn (gen_iornv16qi3 (vec_result, cmp_zero, cmp_res)); |
37ae4739 AS |
747 | emit_insn (gen_altivec_vcmpequb_p (cmp_combined, vec_result, zero_reg)); |
748 | } | |
749 | } | |
750 | else | |
751 | emit_insn (gen_altivec_vcmpequb_p (vec_result, s1data, s2data)); | |
752 | ||
753 | bool branch_to_cleanup = (remain > 0 || equality_compare_rest); | |
754 | rtx cr6 = gen_rtx_REG (CCmode, CR6_REGNO); | |
755 | rtx dst_label; | |
756 | rtx cmp_rtx; | |
757 | if (branch_to_cleanup) | |
758 | { | |
759 | /* Branch to cleanup code, otherwise fall through to do more | |
760 | compares. P8 and P9 use different CR bits because on P8 | |
761 | we are looking at the result of a comparison vs a |
762 | register of zeroes so the all-true condition means no | |
763 | difference or zero was found. On P9, vcmpnezb sets a byte | |
764 | to 0xff if there is a mismatch or zero, so the all-false | |
765 | condition indicates we found no difference or zero. */ | |
766 | if (!cleanup_label) | |
767 | cleanup_label = gen_label_rtx (); | |
768 | dst_label = cleanup_label; | |
769 | if (TARGET_P9_VECTOR && checkzero) | |
770 | cmp_rtx = gen_rtx_NE (VOIDmode, cr6, const0_rtx); | |
771 | else | |
772 | cmp_rtx = gen_rtx_GE (VOIDmode, cr6, const0_rtx); | |
773 | } | |
774 | else | |
775 | { | |
776 | /* Branch to final return or fall through to cleanup, | |
777 | result is already set to 0. */ | |
778 | dst_label = final_move_label; | |
779 | if (TARGET_P9_VECTOR && checkzero) | |
780 | cmp_rtx = gen_rtx_EQ (VOIDmode, cr6, const0_rtx); | |
781 | else | |
782 | cmp_rtx = gen_rtx_LT (VOIDmode, cr6, const0_rtx); | |
783 | } | |
784 | ||
785 | rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label); | |
786 | rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, | |
787 | lab_ref, pc_rtx); | |
faaeebd6 AS |
788 | rtx_insn *j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); |
789 | add_reg_br_prob_note (j2, profile_probability::likely ()); | |
37ae4739 AS |
790 | JUMP_LABEL (j2) = dst_label; |
791 | LABEL_NUSES (dst_label) += 1; | |
792 | ||
793 | offset += cmp_bytes; | |
794 | bytes_to_compare -= cmp_bytes; | |
795 | } | |
796 | *p_cleanup_label = cleanup_label; | |
797 | return; | |
798 | } | |
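/* Worked example of the move-back trick above, for illustration only:
   when comparing 20 bytes, the first iteration compares 16 bytes at
   offset 0; on the second iteration only 4 bytes remain, so extra_bytes
   is 12, the offset is pulled back to 4, and a full 16-byte compare is
   done at offset 4, overlapping 12 bytes that were already compared.  */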
799 | ||
800 | /* Generate the final sequence that identifies the differing | |
801 | byte and generates the final result, taking into account | |
802 | zero bytes: | |
803 | ||
804 | P8: | |
805 | vgbbd 0,0 | |
806 | vsldoi 0,0,0,9 | |
807 | mfvsrd 9,32 | |
808 | addi 10,9,-1 # count trailing zero bits | |
809 | andc 9,10,9 | |
810 | popcntd 9,9 | |
811 | lbzx 10,28,9 # use that offset to load differing byte | |
812 | lbzx 3,29,9 | |
813 | subf 3,3,10 # subtract for final result | |
814 | ||
815 | P9: | |
816 | vclzlsbb # counts trailing bytes with lsb=0 | |
817 | vextublx # extract differing byte | |
818 | ||
819 | STR1 is the reg rtx for data from string 1. | |
820 | STR2 is the reg rtx for data from string 2. | |
821 | RESULT is the reg rtx for the comparison result. | |
822 | S1ADDR is the register to use for the base address of the first string. | |
823 | S2ADDR is the register to use for the base address of the second string. | |
824 | ORIG_SRC1 is the unmodified rtx for the first string. | |
825 | ORIG_SRC2 is the unmodified rtx for the second string. | |
826 | OFF_REG is the register to use for the string offset for loads. | |
827 | VEC_RESULT is the rtx for the vector result indicating the byte difference. */ | |
828 | ||
829 | static void | |
830 | emit_final_compare_vec (rtx str1, rtx str2, rtx result, | |
831 | rtx s1addr, rtx s2addr, | |
832 | rtx orig_src1, rtx orig_src2, | |
833 | rtx off_reg, rtx vec_result) | |
834 | { | |
835 | ||
836 | if (TARGET_P9_VECTOR) | |
837 | { | |
838 | rtx diffix = gen_reg_rtx (SImode); | |
839 | rtx chr1 = gen_reg_rtx (SImode); | |
840 | rtx chr2 = gen_reg_rtx (SImode); | |
841 | rtx chr1_di = simplify_gen_subreg (DImode, chr1, SImode, 0); | |
842 | rtx chr2_di = simplify_gen_subreg (DImode, chr2, SImode, 0); | |
843 | emit_insn (gen_vclzlsbb_v16qi (diffix, vec_result)); | |
844 | emit_insn (gen_vextublx (chr1, diffix, str1)); | |
845 | emit_insn (gen_vextublx (chr2, diffix, str2)); | |
846 | do_sub3 (result, chr1_di, chr2_di); | |
847 | } | |
848 | else | |
849 | { | |
850 | gcc_assert (TARGET_P8_VECTOR); | |
851 | rtx diffix = gen_reg_rtx (DImode); | |
852 | rtx result_gbbd = gen_reg_rtx (V16QImode); | |
853 | /* Since each byte of the input is either 00 or FF, the bytes in | |
854 | dw0 and dw1 after vgbbd are all identical to each other. */ | |
855 | emit_insn (gen_p8v_vgbbd (result_gbbd, vec_result)); | |
856 | /* For LE, we shift by 9 and get BA in the low two bytes then CTZ. | |
857 | For BE, we shift by 7 and get AB in the high two bytes then CLZ. */ | |
858 | rtx result_shifted = gen_reg_rtx (V16QImode); | |
859 | int shift_amt = (BYTES_BIG_ENDIAN) ? 7 : 9; | |
860 | emit_insn (gen_altivec_vsldoi_v16qi (result_shifted, result_gbbd, | |
861 | result_gbbd, GEN_INT (shift_amt))); | |
862 | ||
863 | rtx diffix_df = simplify_gen_subreg (DFmode, diffix, DImode, 0); | |
864 | emit_insn (gen_p8_mfvsrd_3_v16qi (diffix_df, result_shifted)); | |
865 | rtx count = gen_reg_rtx (DImode); | |
866 | ||
867 | if (BYTES_BIG_ENDIAN) | |
868 | emit_insn (gen_clzdi2 (count, diffix)); | |
869 | else | |
870 | emit_insn (gen_ctzdi2 (count, diffix)); | |
871 | ||
872 | /* P8 doesn't have a good solution for extracting one byte from | |
873 | a vsx reg like vextublx on P9 so we just compute the offset | |
874 | of the differing byte and load it from each string. */ | |
875 | do_add3 (off_reg, off_reg, count); | |
876 | ||
877 | rtx chr1 = gen_reg_rtx (QImode); | |
878 | rtx chr2 = gen_reg_rtx (QImode); | |
879 | rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg); | |
880 | do_load_for_compare_from_addr (QImode, chr1, addr1, orig_src1); | |
881 | rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg); | |
882 | do_load_for_compare_from_addr (QImode, chr2, addr2, orig_src2); | |
883 | machine_mode rmode = GET_MODE (result); | |
884 | rtx chr1_rm = simplify_gen_subreg (rmode, chr1, QImode, 0); | |
885 | rtx chr2_rm = simplify_gen_subreg (rmode, chr2, QImode, 0); | |
886 | do_sub3 (result, chr1_rm, chr2_rm); | |
887 | } | |
888 | ||
889 | return; | |
890 | } | |
891 | ||
5ec3397e AS |
892 | /* Expand a block compare operation using loop code, and return true |
893 | if successful. Return false if we should let the compiler generate | |
894 | normal code, probably a memcmp call. | |
895 | ||
896 | OPERANDS[0] is the target (result). | |
897 | OPERANDS[1] is the first source. | |
898 | OPERANDS[2] is the second source. | |
899 | OPERANDS[3] is the length. | |
900 | OPERANDS[4] is the alignment. */ | |
901 | bool | |
902 | expand_compare_loop (rtx operands[]) | |
903 | { | |
904 | rtx target = operands[0]; | |
905 | rtx orig_src1 = operands[1]; | |
906 | rtx orig_src2 = operands[2]; | |
907 | rtx bytes_rtx = operands[3]; | |
908 | rtx align_rtx = operands[4]; | |
909 | ||
910 | /* This case is complicated to handle because the subtract | |
911 | with carry instructions do not generate the 64-bit | |
912 | carry and so we must emit code to calculate it ourselves. | |
913 | We choose not to implement this yet. */ | |
914 | if (TARGET_32BIT && TARGET_POWERPC64) | |
915 | return false; | |
916 | ||
917 | /* Allow non-const length. */ | |
918 | int bytes_is_const = CONST_INT_P (bytes_rtx); | |
919 | ||
920 | /* This must be a fixed size alignment. */ | |
921 | if (!CONST_INT_P (align_rtx)) | |
922 | return false; | |
923 | ||
924 | HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT; | |
925 | HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT; | |
926 | HOST_WIDE_INT minalign = MIN (align1, align2); | |
927 | ||
928 | bool isP7 = (rs6000_tune == PROCESSOR_POWER7); | |
929 | ||
930 | gcc_assert (GET_MODE (target) == SImode); | |
931 | ||
932 | /* Anything to move? */ | |
933 | HOST_WIDE_INT bytes = 0; | |
934 | if (bytes_is_const) | |
935 | bytes = INTVAL (bytes_rtx); | |
936 | ||
937 | if (bytes_is_const && bytes == 0) | |
938 | return true; | |
939 | ||
940 | /* Limit the amount we compare, if known statically. */ | |
941 | HOST_WIDE_INT max_bytes; | |
942 | switch (rs6000_tune) | |
943 | { | |
944 | case PROCESSOR_POWER7: | |
945 | if (!bytes_is_const) | |
946 | if (minalign < 8) | |
947 | max_bytes = 0; | |
948 | else | |
949 | max_bytes = 128; | |
950 | else | |
951 | if (minalign < 8) | |
952 | max_bytes = 32; | |
953 | else | |
954 | max_bytes = 128; | |
955 | break; | |
956 | case PROCESSOR_POWER8: | |
957 | if (!bytes_is_const) | |
958 | max_bytes = 0; | |
959 | else | |
960 | if (minalign < 8) | |
961 | max_bytes = 128; | |
962 | else | |
963 | max_bytes = 64; | |
964 | break; | |
965 | case PROCESSOR_POWER9: | |
5d9d0c94 | 966 | case PROCESSOR_POWER10: |
05f0e9ee | 967 | case PROCESSOR_POWER11: |
5ec3397e AS |
968 | if (bytes_is_const) |
969 | max_bytes = 191; | |
970 | else | |
971 | max_bytes = 0; | |
972 | break; | |
973 | default: | |
974 | max_bytes = 128; | |
975 | } | |
976 | ||
977 | /* Allow the option to override the default. */ | |
978 | if (rs6000_block_compare_inline_loop_limit >= 0) | |
979 | max_bytes = (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_loop_limit; | |
980 | ||
981 | if (max_bytes == 0) | |
982 | return false; | |
983 | ||
984 | rtx cmp_rem = gen_reg_rtx (word_mode); /* Remainder for library call. */ | |
985 | rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop. */ | |
986 | HOST_WIDE_INT niter; | |
987 | rtx iter = gen_reg_rtx (word_mode); | |
988 | rtx iv1 = gen_reg_rtx (word_mode); | |
989 | rtx iv2 = gen_reg_rtx (word_mode); | |
990 | rtx d1_1 = gen_reg_rtx (word_mode); /* Addr expression src1+iv1 */ | |
991 | rtx d1_2 = gen_reg_rtx (word_mode); /* Addr expression src1+iv2 */ | |
992 | rtx d2_1 = gen_reg_rtx (word_mode); /* Addr expression src2+iv1 */ | |
993 | rtx d2_2 = gen_reg_rtx (word_mode); /* Addr expression src2+iv2 */ | |
994 | ||
995 | /* Strip unneeded subreg from length if there is one. */ | |
996 | if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx)) | |
997 | bytes_rtx = SUBREG_REG (bytes_rtx); | |
998 | /* Extend bytes_rtx to word_mode if needed. But, we expect only to | |
999 | maybe have to deal with the case where bytes_rtx is SImode and |
1000 | word_mode is DImode. */ | |
1001 | if (!bytes_is_const) | |
1002 | { | |
1003 | if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode)) | |
1004 | /* Do not expect length longer than word_mode. */ | |
ef4adf1f | 1005 | return false; |
5ec3397e AS |
1006 | else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode)) |
1007 | { | |
1008 | bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx); | |
1009 | bytes_rtx = force_reg (word_mode, | |
1010 | gen_rtx_fmt_e (ZERO_EXTEND, word_mode, | |
1011 | bytes_rtx)); | |
1012 | } | |
1013 | else | |
1014 | /* Make sure it's in a register before we get started. */ | |
1015 | bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx); | |
1016 | } | |
1017 | ||
1018 | machine_mode load_mode = word_mode; | |
1019 | HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode); | |
1020 | ||
1021 | /* Number of bytes per iteration of the unrolled loop. */ | |
1022 | HOST_WIDE_INT loop_bytes = 2 * load_mode_size; | |
1023 | /* max iters and bytes compared in the loop. */ | |
1024 | HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes; | |
1025 | HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes; | |
1026 | int l2lb = floor_log2 (loop_bytes); | |
1027 | ||
1028 | if (bytes_is_const && (max_bytes < load_mode_size | |
1029 | || !IN_RANGE (bytes, load_mode_size, max_bytes))) | |
1030 | return false; | |
1031 | ||
1032 | bool no_remainder_code = false; | |
1033 | rtx final_label = gen_label_rtx (); | |
1034 | rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); | |
1035 | rtx diff_label = gen_label_rtx (); | |
1036 | rtx library_call_label = NULL; | |
1037 | rtx cleanup_label = gen_label_rtx (); | |
1038 | ||
1039 | rtx cr; | |
1040 | ||
1041 | rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0)); | |
1042 | rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0)); | |
1043 | ||
1044 | /* Difference found is stored here before jump to diff_label. */ | |
1045 | rtx diff = gen_reg_rtx (word_mode); | |
faaeebd6 | 1046 | rtx_insn *j; |
5ec3397e AS |
1047 | |
1048 | /* Example of generated code for a 35-byte compare with 1-byte alignment. |
ef4adf1f | 1049 | |
5ec3397e AS |
1050 | mtctr 8 |
1051 | li 6,0 | |
1052 | li 5,8 | |
1053 | .L13: | |
1054 | ldbrx 7,3,6 | |
1055 | ldbrx 9,10,6 | |
1056 | ldbrx 0,3,5 | |
1057 | ldbrx 4,10,5 | |
1058 | addi 6,6,16 | |
1059 | addi 5,5,16 | |
1060 | subfc. 9,9,7 | |
1061 | bne 0,.L10 | |
1062 | subfc. 9,4,0 | |
1063 | bdnzt 2,.L13 | |
1064 | bne 0,.L10 | |
1065 | add 3,3,6 | |
1066 | add 10,10,6 | |
1067 | addi 9,3,-5 | |
1068 | ldbrx 7,0,9 | |
1069 | addi 9,10,-5 | |
1070 | ldbrx 9,0,9 | |
1071 | subfc 9,9,7 | |
1072 | .p2align 4,,15 | |
1073 | .L10: | |
1074 | popcntd 9,9 | |
1075 | subfe 10,10,10 | |
1076 | or 9,9,10 | |
ef4adf1f | 1077 | |
5ec3397e AS |
1078 | Compiled with -fno-reorder-blocks for clarity. */ |
1079 | ||
1080 | /* Structure of what we're going to do: | |
1081 | Two separate lengths: what we will compare before bailing to library | |
1082 | call (max_bytes), and the total length to be checked. | |
1083 | if length <= 16, branch to linear cleanup code starting with | |
1084 | remainder length check (length not known at compile time) | |
1085 | set up 2 iv's and load count reg, compute remainder length | |
1086 | unrollx2 compare loop | |
1087 | if loop exit due to a difference, branch to difference handling code | |
1088 | if remainder length < 8, branch to final cleanup compare | |
1089 | load and compare 8B | |
1090 | final cleanup comparison (depends on alignment and length) | |
1091 | load 8B, shift off bytes past length, compare | |
1092 | load 8B ending at last byte and compare | |
1093 | load/compare 1 byte at a time (short block abutting 4k boundary) | |
1094 | difference handling, 64->32 conversion | |
1095 | final result | |
1096 | branch around memcmp call | |
1097 | memcmp library call | |
1098 | */ | |
1099 | ||
1100 | /* If bytes is not const, compare the length and branch directly |
1101 | to the cleanup code, which can handle 0-16 bytes, if the length |
1102 | is less than loop_bytes. Stash away the remainder for the library call. */ |
1103 | if (bytes_is_const) | |
1104 | { | |
1105 | /* These need to be set for some of the places we may jump to. */ | |
1106 | if (bytes > max_bytes) | |
1107 | { | |
1108 | no_remainder_code = true; | |
1109 | niter = max_loop_iter; | |
1110 | library_call_label = gen_label_rtx (); | |
1111 | } | |
1112 | else | |
1113 | { | |
1114 | niter = bytes / loop_bytes; | |
1115 | } | |
1116 | emit_move_insn (iter, GEN_INT (niter)); | |
1117 | emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes)); | |
1118 | emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes)); | |
1119 | } | |
1120 | else | |
1121 | { | |
1122 | library_call_label = gen_label_rtx (); | |
1123 | ||
1124 | /* If we go to the cleanup code, it expects length to be in cmp_rem. */ | |
1125 | emit_move_insn (cmp_rem, bytes_rtx); | |
1126 | ||
1127 | /* Check for > max_bytes bytes. We want to bail out as quickly as | |
1128 | possible if we have to go over to memcmp. */ | |
1129 | do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes), | |
faaeebd6 | 1130 | NULL_RTX, library_call_label, profile_probability::even ()); |
5ec3397e AS |
1131 | |
1132 | /* Check for < loop_bytes bytes. */ | |
1133 | do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes), | |
faaeebd6 | 1134 | NULL_RTX, cleanup_label, profile_probability::even ()); |
5ec3397e AS |
1135 | |
1136 | /* Loop compare bytes and iterations if bytes>max_bytes. */ | |
1137 | rtx mb_reg = gen_reg_rtx (word_mode); | |
1138 | emit_move_insn (mb_reg, GEN_INT (max_loop_bytes)); | |
1139 | rtx mi_reg = gen_reg_rtx (word_mode); | |
1140 | emit_move_insn (mi_reg, GEN_INT (max_loop_iter)); | |
1141 | ||
1142 | /* Compute number of loop iterations if bytes <= max_bytes. */ | |
1143 | if (word_mode == DImode) | |
1144 | emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb))); | |
1145 | else | |
1146 | emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb))); | |
1147 | ||
1148 | /* Compute bytes to compare in loop if bytes <= max_bytes. */ | |
1149 | rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb); | |
1150 | if (word_mode == DImode) | |
1151 | { | |
1152 | emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask)); | |
1153 | } | |
1154 | else | |
1155 | { | |
1156 | emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask)); | |
1157 | } | |
1158 | ||
1159 | /* Check for bytes <= max_bytes. */ | |
1160 | if (TARGET_ISEL) | |
1161 | { | |
1162 | /* P9 has fast isel so we use one compare and two isel. */ | |
1163 | cr = gen_reg_rtx (CCmode); | |
1164 | rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx, | |
1165 | GEN_INT (max_bytes)); | |
1166 | emit_move_insn (cr, compare_rtx); | |
1167 | rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx); | |
1168 | do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr); | |
1169 | do_isel (iter, cmp_rtx, iter, mi_reg, cr); | |
1170 | } | |
1171 | else | |
1172 | { | |
1173 | rtx lab_after = gen_label_rtx (); | |
1174 | do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes), | |
faaeebd6 | 1175 | NULL_RTX, lab_after, profile_probability::even ()); |
5ec3397e AS |
1176 | emit_move_insn (loop_cmp, mb_reg); |
1177 | emit_move_insn (iter, mi_reg); | |
1178 | emit_label (lab_after); | |
1179 | } | |
1180 | ||
1181 | /* Now compute remainder bytes which isn't used until after the loop. */ | |
1182 | do_sub3 (cmp_rem, bytes_rtx, loop_cmp); | |
1183 | } | |
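/* Illustrative numbers for the non-constant path above, not from the
   original source: with 64-bit code loop_bytes is 16 and l2lb is 4, so a
   runtime length of 100 gives iter = 100 >> 4 = 6, loop_cmp = 96, and
   cmp_rem = 4 left for the cleanup code after the loop.  */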
1184 | ||
1185 | rtx dcond = NULL_RTX; /* Used for when we jump to diff_label. */ | |
1186 | /* For p9 we need to have just one of these as multiple places define | |
1187 | it and it gets used by the setb at the end. */ | |
1188 | if (TARGET_P9_MISC) | |
1189 | dcond = gen_reg_rtx (CCUNSmode); | |
1190 | ||
1191 | if (!bytes_is_const || bytes >= loop_bytes) | |
1192 | { | |
1193 | /* It should not be possible to come here if the remaining bytes are |
1194 | < 16 in the runtime case either. Compute number of loop | |
1195 | iterations. We compare 2*word_mode per iteration so 16B for | |
1196 | 64-bit code and 8B for 32-bit. Set up two induction | |
1197 | variables and load count register. */ | |
1198 | ||
1199 | /* HACK ALERT: create hard reg for CTR here. If we just use a | |
1200 | pseudo, cse will get rid of it and then the allocator will | |
1201 | see it used in the lshr above and won't give us ctr. */ | |
1202 | rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO); | |
1203 | emit_move_insn (ctr, iter); | |
1204 | emit_move_insn (diff, GEN_INT (0)); | |
1205 | emit_move_insn (iv1, GEN_INT (0)); | |
1206 | emit_move_insn (iv2, GEN_INT (load_mode_size)); | |
1207 | ||
1208 | /* inner loop to compare 2*word_mode */ | |
1209 | rtx loop_top_label = gen_label_rtx (); | |
1210 | emit_label (loop_top_label); | |
1211 | ||
1212 | rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1); | |
1213 | rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1); | |
1214 | ||
1215 | do_load_for_compare_from_addr (load_mode, d1_1, | |
1216 | src1_ix1, orig_src1); | |
1217 | do_load_for_compare_from_addr (load_mode, d2_1, | |
1218 | src2_ix1, orig_src2); | |
1219 | do_add3 (iv1, iv1, GEN_INT (loop_bytes)); | |
1220 | ||
1221 | rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2); | |
1222 | rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2); | |
1223 | ||
1224 | do_load_for_compare_from_addr (load_mode, d1_2, | |
1225 | src1_ix2, orig_src1); | |
1226 | do_load_for_compare_from_addr (load_mode, d2_2, | |
1227 | src2_ix2, orig_src2); | |
1228 | do_add3 (iv2, iv2, GEN_INT (loop_bytes)); | |
1229 | ||
1230 | if (TARGET_P9_MISC) | |
1231 | { | |
1232 | /* Generate a compare, and convert with a setb later. */ | |
1233 | rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1); | |
1234 | emit_insn (gen_rtx_SET (dcond, cmp)); | |
1235 | } | |
1236 | else | |
1237 | { | |
1238 | dcond = gen_reg_rtx (CCmode); | |
1239 | if (word_mode == DImode) | |
1240 | emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond)); | |
1241 | else | |
1242 | emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond)); | |
1243 | } | |
1244 | ||
1245 | do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX, | |
faaeebd6 | 1246 | dcond, diff_label, profile_probability::unlikely ()); |
5ec3397e AS |
1247 | |
1248 | if (TARGET_P9_MISC) | |
1249 | { | |
1250 | /* Generate a compare, and convert with a setb later. */ | |
1251 | rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2); | |
1252 | emit_insn (gen_rtx_SET (dcond, cmp)); | |
1253 | } | |
1254 | else | |
1255 | { | |
1256 | dcond = gen_reg_rtx (CCmode); | |
1257 | if (word_mode == DImode) | |
1258 | emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond)); | |
1259 | else | |
1260 | emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond)); | |
1261 | } | |
1262 | ||
1263 | rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2); | |
1264 | if (TARGET_64BIT) | |
1265 | j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr, | |
1266 | eqrtx, dcond)); | |
1267 | else | |
1268 | j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr, | |
1269 | eqrtx, dcond)); | |
faaeebd6 | 1270 | add_reg_br_prob_note (j, profile_probability::likely ()); |
5ec3397e AS |
1271 | JUMP_LABEL (j) = loop_top_label; |
1272 | LABEL_NUSES (loop_top_label) += 1; | |
1273 | } | |
1274 | ||
1275 | HOST_WIDE_INT bytes_remaining = 0; | |
1276 | if (bytes_is_const) | |
1277 | bytes_remaining = (bytes % loop_bytes); | |
1278 | ||
1279 | /* If diff is nonzero, branch to difference handling | |
1280 | code. If we exit here with a nonzero diff, it is | |
1281 | because the second word differed. */ | |
1282 | if (TARGET_P9_MISC) | |
faaeebd6 AS |
1283 | do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond, |
1284 | diff_label, profile_probability::unlikely ()); | |
5ec3397e | 1285 | else |
faaeebd6 AS |
1286 | do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX, |
1287 | diff_label, profile_probability::unlikely ()); | |
5ec3397e AS |
1288 | |
1289 | if (library_call_label != NULL && bytes_is_const && bytes > max_bytes) | |
1290 | { | |
1291 | /* If the length is known at compile time, then we will always | |
1292 | have a remainder to go to the library call with. */ | |
1293 | rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, library_call_label); | |
1294 | j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref)); | |
1295 | JUMP_LABEL (j) = library_call_label; | |
1296 | LABEL_NUSES (library_call_label) += 1; | |
1297 | emit_barrier (); | |
1298 | } | |
1299 | ||
1300 | if (bytes_is_const && bytes_remaining == 0) | |
1301 | { | |
1302 | /* No remainder; if we are here then diff is 0, so just return 0. */ |
1303 | if (TARGET_64BIT) | |
1304 | emit_insn (gen_movsi (target, gen_lowpart (SImode, diff))); | |
1305 | else | |
1306 | emit_move_insn (target, diff); | |
1307 | j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref)); | |
1308 | JUMP_LABEL (j) = final_label; | |
1309 | LABEL_NUSES (final_label) += 1; | |
1310 | emit_barrier (); | |
1311 | } | |
1312 | else if (!no_remainder_code) | |
1313 | { | |
1314 | /* Update addresses to point to the next word to examine. */ | |
1315 | do_add3 (src1_addr, src1_addr, iv1); | |
1316 | do_add3 (src2_addr, src2_addr, iv1); | |
1317 | ||
1318 | emit_label (cleanup_label); | |
1319 | ||
1320 | if (!bytes_is_const) | |
1321 | { | |
1322 | /* If we're dealing with runtime length, we have to check if | |
ef4adf1f | 1323 | it's zero after the loop. When length is known at compile |
5ec3397e AS |
1324 | time the no-remainder condition is dealt with above. By |
1325 | doing this after cleanup_label, we also deal with the | |
1326 | case where length is 0 at the start and we bypass the | |
1327 | loop with a branch to cleanup_label. */ | |
1328 | emit_move_insn (target, const0_rtx); | |
1329 | do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, | |
faaeebd6 | 1330 | NULL_RTX, final_label, profile_probability::unlikely ()); |
5ec3397e AS |
1331 | } |
1332 | ||
1333 | rtx final_cleanup = gen_label_rtx (); | |
1334 | rtx cmp_rem_before = gen_reg_rtx (word_mode); | |
1335 | /* Compare one more word_mode chunk if needed. */ | |
37ca383f | 1336 | if (!bytes_is_const || bytes_remaining >= load_mode_size) |
5ec3397e AS |
1337 | { |
1338 | /* If remainder length < word length, branch to final | |
1339 | cleanup compare. */ | |
faaeebd6 | 1340 | |
5ec3397e | 1341 | if (!bytes_is_const) |
faaeebd6 AS |
1342 | { |
1343 | do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size), | |
1344 | NULL_RTX, final_cleanup, profile_probability::even ()); | |
1345 | } | |
5ec3397e AS |
1346 | |
1347 | /* load and compare 8B */ | |
1348 | do_load_for_compare_from_addr (load_mode, d1_1, | |
1349 | src1_addr, orig_src1); | |
1350 | do_load_for_compare_from_addr (load_mode, d2_1, | |
1351 | src2_addr, orig_src2); | |
1352 | ||
1353 | /* Compare the word, see if we need to do the last partial. */ | |
1354 | if (TARGET_P9_MISC) | |
1355 | { | |
1356 | /* Generate a compare, and convert with a setb later. */ | |
1357 | rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1); | |
1358 | emit_insn (gen_rtx_SET (dcond, cmp)); | |
1359 | } | |
1360 | else | |
1361 | { | |
1362 | dcond = gen_reg_rtx (CCmode); | |
1363 | if (word_mode == DImode) | |
1364 | emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond)); | |
1365 | else | |
1366 | emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond)); | |
1367 | } | |
1368 | ||
1369 | do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX, | |
faaeebd6 | 1370 | dcond, diff_label, profile_probability::even ()); |
5ec3397e AS |
1371 | |
1372 | do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size)); | |
1373 | do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size)); | |
1374 | emit_move_insn (cmp_rem_before, cmp_rem); | |
1375 | do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size)); | |
1376 | if (bytes_is_const) | |
1377 | bytes_remaining -= load_mode_size; | |
1378 | else | |
1379 | /* See if remaining length is now zero. We previously set | |
1380 | target to 0 so we can just jump to the end. */ | |
faaeebd6 AS |
1381 | do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX, |
1382 | final_label, profile_probability::unlikely ()); | |
5ec3397e AS |
1383 | } |
1384 | ||
1385 | /* Cases: | |
1386 | bytes_is_const | |
1387 | We can always shift back to do an overlapping compare | |
1388 | of the last chunk because we know length >= 8. | |
1389 | ||
1390 | !bytes_is_const | |
1391 | align>=load_mode_size | |
1392 | Read word_mode and mask | |
1393 | align<load_mode_size | |
1394 | avoid stepping past end | |
1395 | ||
1396 | Three strategies: | |
1397 | * decrement address and do overlapping compare | |
1398 | * read word_mode and mask | |
1399 | * carefully avoid crossing 4k boundary | |
1400 | */ | |
1401 | ||
1402 | if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7)) | |
1403 | && align1 >= load_mode_size && align2 >= load_mode_size) | |
1404 | { | |
1405 | /* Alignment is larger than word_mode so we do not need to be | |
1406 | concerned with extra page crossings. But, we do not know | |
1407 | that the length is larger than load_mode_size so we might | |
1408 | end up comparing against data before the block if we try |
1409 | an overlapping compare. Also we use this on P7 for fixed length | |
1410 | remainder because P7 doesn't like overlapping unaligned. | |
1411 | Strategy: load 8B, shift off bytes past length, and compare. */ | |
1412 | emit_label (final_cleanup); | |
1413 | do_load_mask_compare (load_mode, diff, cmp_rem, dcond, | |
1414 | src1_addr, src2_addr, orig_src1, orig_src2); | |
1415 | } | |
1416 | else if (bytes_remaining && bytes_is_const) | |
1417 | { | |
1418 | /* We do not do loop expand if length < 32 so we know at the | |
1419 | end we can do an overlapping compare. | |
1420 | Strategy: shift address back and do word_mode load that | |
1421 | ends at the end of the block. */ | |
1422 | emit_label (final_cleanup); | |
1423 | do_overlap_load_compare (load_mode, true, bytes_remaining, diff, | |
1424 | cmp_rem, dcond, src1_addr, src2_addr, | |
1425 | orig_src1, orig_src2); | |
1426 | } | |
1427 | else if (!bytes_is_const) | |
1428 | { | |
1429 | rtx handle4k_label = gen_label_rtx (); | |
1430 | rtx nonconst_overlap = gen_label_rtx (); | |
1431 | emit_label (nonconst_overlap); | |
1432 | ||
1433 | /* Here we have to handle the case where we have a runtime |
1434 | length which may be too short for overlap compare, and | |
1435 | alignment is not at least load_mode_size so we have to | |
1436 | tread carefully to avoid stepping across 4k boundaries. */ | |
1437 | ||
1438 | /* If the length after the loop was larger than word_mode | |
1439 | size, we can just do an overlapping compare and we're | |
1440 | done. We fall through to this code from the word_mode | |
1441 | compare that precedes this. */ |
1442 | do_overlap_load_compare (load_mode, false, 0, diff, | |
1443 | cmp_rem, dcond, src1_addr, src2_addr, | |
1444 | orig_src1, orig_src2); | |
1445 | ||
1446 | rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label); | |
1447 | j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref)); | |
1448 | JUMP_LABEL (j) = diff_label; | |
1449 | LABEL_NUSES (diff_label) += 1; | |
1450 | emit_barrier (); | |
1451 | ||
1452 | /* If we couldn't do the overlap compare we have to be more | |
1453 | careful of the 4k boundary. Test to see if either | |
1454 | address is less than word_mode_size away from a 4k | |
1455 | boundary. If not, then we can do a load/shift/compare | |
1456 | and we are done. We come to this code if length was less | |
1457 | than word_mode_size. */ | |
1458 | ||
1459 | emit_label (final_cleanup); | |
1460 | ||
1461 | /* We can still avoid the slow case if the length was larger | |
1462 | than one loop iteration, in which case go do the overlap | |
1463 | load compare path. */ | |
1464 | do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes), | |
faaeebd6 | 1465 | NULL_RTX, nonconst_overlap, profile_probability::even ()); |
5ec3397e AS |
1466 | |
1467 | rtx rem4k = gen_reg_rtx (word_mode); | |
1468 | rtx dist1 = gen_reg_rtx (word_mode); | |
1469 | rtx dist2 = gen_reg_rtx (word_mode); | |
1470 | do_sub3 (rem4k, GEN_INT (4096), cmp_rem); | |
1471 | if (word_mode == SImode) | |
1472 | emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff))); | |
1473 | else | |
1474 | emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff))); | |
faaeebd6 AS |
1475 | do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX, |
1476 | handle4k_label, profile_probability::very_unlikely ()); | |
5ec3397e AS |
1477 | if (word_mode == SImode) |
1478 | emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff))); | |
1479 | else | |
1480 | emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff))); | |
faaeebd6 AS |
1481 | do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX, |
1482 | handle4k_label, profile_probability::very_unlikely ()); | |
5ec3397e AS |
1483 | |
1484 | /* We don't have a 4k boundary to deal with, so do | |
1485 | a load/shift/compare and jump to diff. */ | |
1486 | ||
1487 | do_load_mask_compare (load_mode, diff, cmp_rem, dcond, | |
1488 | src1_addr, src2_addr, orig_src1, orig_src2); | |
1489 | ||
1490 | j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref)); | |
1491 | JUMP_LABEL (j) = diff_label; | |
1492 | LABEL_NUSES (diff_label) += 1; | |
1493 | emit_barrier (); | |
1494 | ||
1495 | /* Finally in the unlikely case we are inching up to a | |
1496 | 4k boundary we use a compact lbzx/compare loop to do | |
1497 | it a byte at a time. */ | |
1498 | ||
1499 | emit_label (handle4k_label); | |
1500 | ||
1501 | rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO); | |
1502 | emit_move_insn (ctr, cmp_rem); | |
1503 | rtx ixreg = gen_reg_rtx (Pmode); | |
1504 | emit_move_insn (ixreg, const0_rtx); | |
1505 | ||
1506 | rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg); | |
1507 | rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg); | |
1508 | rtx d1 = gen_reg_rtx (word_mode); | |
1509 | rtx d2 = gen_reg_rtx (word_mode); | |
1510 | ||
1511 | rtx fc_loop = gen_label_rtx (); | |
1512 | emit_label (fc_loop); | |
1513 | ||
1514 | do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1); | |
1515 | do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2); | |
1516 | ||
1517 | do_add3 (ixreg, ixreg, const1_rtx); | |
1518 | ||
1519 | rtx cond = gen_reg_rtx (CCmode); | |
1520 | rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2); | |
1521 | rs6000_emit_dot_insn (diff, subexpr, 2, cond); | |
1522 | ||
1523 | rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2); | |
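/* The decrement-and-branch below loops back to fc_loop while bytes
   remain in the CTR and the two bytes just loaded compared equal; it
   falls through on the first difference or once cmp_rem bytes have
   been examined.  */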
1524 | if (TARGET_64BIT) | |
1525 | j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr, | |
1526 | eqrtx, cond)); | |
1527 | else | |
1528 | j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr, | |
1529 | eqrtx, cond)); | |
5585759f | 1530 | add_reg_br_prob_note (j, profile_probability::likely ()); |
5ec3397e AS |
1531 | JUMP_LABEL (j) = fc_loop; |
1532 | LABEL_NUSES (fc_loop) += 1; | |
1533 | ||
1534 | if (TARGET_64BIT) | |
1535 | emit_insn (gen_movsi (target, gen_lowpart (SImode, diff))); | |
1536 | else | |
1537 | emit_move_insn (target, diff); | |
1538 | ||
1539 | /* Since we are comparing bytes, the difference can be used | |
1540 | as the final result and we are done here. */ | |
1541 | j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref)); | |
1542 | JUMP_LABEL (j) = final_label; | |
1543 | LABEL_NUSES (final_label) += 1; | |
1544 | emit_barrier (); | |
1545 | } | |
1546 | } | |
1547 | ||
1548 | emit_label (diff_label); | |
1549 | /* difference handling, 64->32 conversion */ | |
1550 | ||
1551 | /* We need to produce DI result from sub, then convert to target SI | |
1552 | while maintaining <0 / ==0 / >0 properties. This sequence works: | |
1553 | subfc L,A,B | |
1554 | subfe H,H,H | |
1555 | popcntd L,L | |
1556 | rldimi L,H,6,0 | |
1557 | ||
1558 | This is an alternate one Segher cooked up if somebody | |
1559 | wants to expand this for something that doesn't have popcntd: | |
1560 | subfc L,a,b | |
1561 | subfe H,x,x | |
1562 | addic t,L,-1 | |
1563 | subfe v,t,L | |
1564 | or z,v,H | |
1565 | ||
1566 | And finally, p9 can just do this: | |
1567 | cmpld A,B | |
1568 | setb r */ | |
1569 | ||
1570 | if (TARGET_P9_MISC) | |
1571 | emit_insn (gen_setb_unsigned (target, dcond)); | |
1572 | else | |
1573 | { | |
1574 | if (TARGET_64BIT) | |
1575 | { | |
1576 | rtx tmp_reg_ca = gen_reg_rtx (DImode); | |
1577 | emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca)); | |
1578 | emit_insn (gen_popcntddi2 (diff, diff)); | |
1579 | emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca)); | |
1580 | emit_insn (gen_movsi (target, gen_lowpart (SImode, diff))); | |
1581 | } | |
1582 | else | |
1583 | { | |
1584 | rtx tmp_reg_ca = gen_reg_rtx (SImode); | |
1585 | emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca)); | |
1586 | emit_insn (gen_popcntdsi2 (diff, diff)); | |
1587 | emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca)); | |
1588 | } | |
1589 | } | |
1590 | ||
1591 | if (library_call_label != NULL) | |
1592 | { | |
1593 | /* Branch around memcmp call. */ | |
1594 | j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref)); | |
1595 | JUMP_LABEL (j) = final_label; | |
1596 | LABEL_NUSES (final_label) += 1; | |
1597 | emit_barrier (); | |
1598 | ||
1599 | /* Make memcmp library call. cmp_rem is the number of bytes that |
1600 | remain to be compared, and is the amount memcmp is expected to |
1601 | compare. If we don't find a difference in the loop compare, do |
1602 | the library call directly instead of doing a small compare just | |
1603 | to get to an arbitrary boundary before calling it anyway. | |
1604 | Also, update addresses to point to the next word to examine. */ | |
1605 | emit_label (library_call_label); | |
1606 | ||
1607 | rtx len_rtx = gen_reg_rtx (word_mode); | |
1608 | if (bytes_is_const) | |
1609 | { | |
1610 | emit_move_insn (len_rtx, cmp_rem); | |
1611 | do_add3 (src1_addr, src1_addr, iv1); | |
1612 | do_add3 (src2_addr, src2_addr, iv1); | |
1613 | } | |
1614 | else | |
1615 | emit_move_insn (len_rtx, bytes_rtx); | |
1616 | ||
1617 | tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP); | |
1618 | emit_library_call_value (XEXP (DECL_RTL (fun), 0), | |
1619 | target, LCT_NORMAL, GET_MODE (target), | |
1620 | src1_addr, Pmode, | |
1621 | src2_addr, Pmode, | |
1622 | len_rtx, GET_MODE (len_rtx)); | |
1623 | } | |
1624 | ||
1625 | /* emit final_label */ | |
1626 | emit_label (final_label); | |
1627 | return true; | |
1628 | } | |
1629 | ||
37ae4739 AS |
1630 | /* Generate code to convert a DImode-plus-carry subtract result into |
1631 | a SImode result that has the same <0 / ==0 / >0 properties to | |
1632 | produce the final result from memcmp. | |
8845cb37 | 1633 | |
37ae4739 AS |
1634 | TARGET is the rtx for the register to receive the memcmp result. |
1635 | SUB_RESULT is the rtx for the register containing the subtract result. */ |
8845cb37 | 1636 | |
37ae4739 AS |
1637 | void |
1638 | generate_6432_conversion(rtx target, rtx sub_result) | |
1639 | { | |
1640 | /* We need to produce DI result from sub, then convert to target SI | |
1641 | while maintaining <0 / ==0 / >0 properties. This sequence works: | |
1642 | subfc L,A,B | |
1643 | subfe H,H,H | |
1644 | popcntd L,L | |
1645 | rldimi L,H,6,0 | |
8845cb37 | 1646 | |
37ae4739 AS |
1647 | This is an alternate one Segher cooked up if somebody |
1648 | wants to expand this for something that doesn't have popcntd: | |
1649 | subfc L,a,b | |
1650 | subfe H,x,x | |
1651 | addic t,L,-1 | |
1652 | subfe v,t,L | |
1653 | or z,v,H | |
8845cb37 | 1654 | |
37ae4739 AS |
1655 | And finally, p9 can just do this: |
1656 | cmpld A,B | |
1657 | setb r */ | |
8845cb37 | 1658 | |
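/* A rough sketch of why the sequence below preserves <0 / ==0 / >0:
   popcntd of SUB_RESULT is 0 when the two words were equal and a small
   positive value (at most 64) otherwise, while the register derived
   from the carry is all-ones exactly when the unsigned subtract
   borrowed.  OR-ing the two therefore yields a negative, zero, or
   positive value as required, and in the 64-bit case truncating to
   SImode is safe because the positive case never exceeds 64.  */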
37ae4739 AS |
1659 | if (TARGET_64BIT) |
1660 | { | |
1661 | rtx tmp_reg_ca = gen_reg_rtx (DImode); | |
1662 | emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca)); | |
1663 | rtx popcnt = gen_reg_rtx (DImode); | |
1664 | emit_insn (gen_popcntddi2 (popcnt, sub_result)); | |
1665 | rtx tmp2 = gen_reg_rtx (DImode); | |
1666 | emit_insn (gen_iordi3 (tmp2, popcnt, tmp_reg_ca)); | |
1667 | emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp2))); | |
1668 | } | |
8845cb37 | 1669 | else |
37ae4739 AS |
1670 | { |
1671 | rtx tmp_reg_ca = gen_reg_rtx (SImode); | |
1672 | emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca)); | |
1673 | rtx popcnt = gen_reg_rtx (SImode); | |
1674 | emit_insn (gen_popcntdsi2 (popcnt, sub_result)); | |
1675 | emit_insn (gen_iorsi3 (target, popcnt, tmp_reg_ca)); | |
1676 | } | |
1677 | } | |
8845cb37 | 1678 | |
37ae4739 AS |
1679 | /* Generate memcmp expansion using in-line non-loop GPR instructions. |
1680 | The bool return indicates whether code for a 64->32 conversion | |
1681 | should be generated. | |
1682 | ||
1683 | BYTES is the number of bytes to be compared. | |
1684 | BASE_ALIGN is the minimum alignment for both blocks to compare. | |
1685 | ORIG_SRC1 is the original pointer to the first block to compare. | |
1686 | ORIG_SRC2 is the original pointer to the second block to compare. | |
1687 | SUB_RESULT is the reg rtx for the result from the final subtract. | |
1688 | COND is rtx for a condition register that will be used for the final | |
1689 | compare on power9 or better. | |
1690 | FINAL_RESULT is the reg rtx for the final memcmp result. | |
1691 | P_CONVERT_LABEL is a pointer to rtx that will be used to store the | |
1692 | label generated for a branch to the 64->32 code, if such a branch | |
1693 | is needed. | |
1694 | P_FINAL_LABEL is a pointer to rtx that will be used to store the label | |
1695 | for the end of the memcmp if a branch there is needed. | |
1696 | */ | |
8845cb37 | 1697 | |
37ae4739 AS |
1698 | bool |
1699 | expand_block_compare_gpr(unsigned HOST_WIDE_INT bytes, unsigned int base_align, | |
1700 | rtx orig_src1, rtx orig_src2, | |
1701 | rtx sub_result, rtx cond, rtx final_result, | |
1702 | rtx *p_convert_label, rtx *p_final_label) | |
1703 | { | |
8845cb37 AS |
1704 | /* Example of generated code for 18 bytes aligned 1 byte. |
1705 | Compiled with -fno-reorder-blocks for clarity. | |
1706 | ldbrx 10,31,8 | |
1707 | ldbrx 9,7,8 | |
1708 | subfc. 9,9,10 | |
1709 | bne 0,.L6487 | |
1710 | addi 9,12,8 | |
1711 | addi 5,11,8 | |
1712 | ldbrx 10,0,9 | |
1713 | ldbrx 9,0,5 | |
1714 | subfc. 9,9,10 | |
1715 | bne 0,.L6487 | |
1716 | addi 9,12,16 | |
1717 | lhbrx 10,0,9 | |
1718 | addi 9,11,16 | |
1719 | lhbrx 9,0,9 | |
1720 | subf 9,9,10 | |
1721 | b .L6488 | |
1722 | .p2align 4,,15 | |
1723 | .L6487: #convert_label | |
1724 | popcntd 9,9 | |
1725 | subfe 10,10,10 | |
1726 | or 9,9,10 | |
1727 | .L6488: #final_label | |
1728 | extsw 10,9 | |
1729 | ||
1730 | We start off with DImode for two blocks that jump to the DI->SI conversion | |
1731 | if the difference is found there, then a final block of HImode that skips | |
1732 | the DI->SI conversion. */ | |
1733 | ||
37ae4739 AS |
1734 | unsigned HOST_WIDE_INT offset = 0; |
1735 | unsigned int load_mode_size; | |
1736 | HOST_WIDE_INT cmp_bytes = 0; | |
1737 | rtx src1 = orig_src1; | |
1738 | rtx src2 = orig_src2; | |
1739 | rtx tmp_reg_src1 = gen_reg_rtx (word_mode); | |
1740 | rtx tmp_reg_src2 = gen_reg_rtx (word_mode); | |
1741 | bool need_6432_conv = false; | |
1742 | rtx convert_label = NULL; | |
1743 | rtx final_label = NULL; | |
1744 | machine_mode load_mode; | |
1745 | ||
8845cb37 AS |
1746 | while (bytes > 0) |
1747 | { | |
1748 | unsigned int align = compute_current_alignment (base_align, offset); | |
74f9986e | 1749 | load_mode = select_block_compare_mode (offset, bytes, align); |
8845cb37 AS |
1750 | load_mode_size = GET_MODE_SIZE (load_mode); |
1751 | if (bytes >= load_mode_size) | |
1752 | cmp_bytes = load_mode_size; | |
78bd9e25 HG |
1753 | else if (!targetm.slow_unaligned_access (load_mode, |
1754 | align * BITS_PER_UNIT)) | |
8845cb37 AS |
1755 | { |
1756 | /* Move this load back so it doesn't go past the end. | |
1757 | P8/P9 can do this efficiently. */ | |
1758 | unsigned int extra_bytes = load_mode_size - bytes; | |
1759 | cmp_bytes = bytes; | |
1760 | if (extra_bytes < offset) | |
1761 | { | |
1762 | offset -= extra_bytes; | |
1763 | cmp_bytes = load_mode_size; | |
1764 | bytes = cmp_bytes; | |
1765 | } | |
1766 | } | |
1767 | else | |
1768 | /* P7 and earlier can't do the overlapping load trick fast, | |
1769 | so this forces a non-overlapping load and a shift to get | |
1770 | rid of the extra bytes. */ | |
1771 | cmp_bytes = bytes; | |
1772 | ||
1773 | src1 = adjust_address (orig_src1, load_mode, offset); | |
1774 | src2 = adjust_address (orig_src2, load_mode, offset); | |
1775 | ||
1776 | if (!REG_P (XEXP (src1, 0))) | |
1777 | { | |
1778 | rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0)); | |
1779 | src1 = replace_equiv_address (src1, src1_reg); | |
1780 | } | |
f4f867ca | 1781 | set_mem_size (src1, load_mode_size); |
8845cb37 AS |
1782 | |
1783 | if (!REG_P (XEXP (src2, 0))) | |
1784 | { | |
1785 | rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0)); | |
1786 | src2 = replace_equiv_address (src2, src2_reg); | |
1787 | } | |
f4f867ca | 1788 | set_mem_size (src2, load_mode_size); |
8845cb37 AS |
1789 | |
1790 | do_load_for_compare (tmp_reg_src1, src1, load_mode); | |
1791 | do_load_for_compare (tmp_reg_src2, src2, load_mode); | |
1792 | ||
1793 | if (cmp_bytes < load_mode_size) | |
1794 | { | |
1795 | /* Shift unneeded bytes off. */ | |
1796 | rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes)); | |
1797 | if (word_mode == DImode) | |
1798 | { | |
1799 | emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh)); | |
1800 | emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh)); | |
1801 | } | |
1802 | else | |
1803 | { | |
1804 | emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh)); | |
1805 | emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh)); | |
1806 | } | |
1807 | } | |
1808 | ||
1809 | int remain = bytes - cmp_bytes; | |
37ae4739 | 1810 | if (GET_MODE_SIZE (GET_MODE (final_result)) > GET_MODE_SIZE (load_mode)) |
8845cb37 | 1811 | { |
37ae4739 | 1812 | /* Final_result is larger than load size so we don't need to |
8845cb37 AS |
1813 | reduce result size. */ |
1814 | ||
1815 | /* We previously did a block that needed 64->32 conversion but |
1816 | the current block does not, so a label is needed to jump | |
1817 | to the end. */ | |
37ae4739 | 1818 | if (need_6432_conv && !final_label) |
8845cb37 AS |
1819 | final_label = gen_label_rtx (); |
1820 | ||
1821 | if (remain > 0) | |
1822 | { | |
1823 | /* This is not the last block, branch to the end if the result | |
1824 | of this subtract is not zero. */ | |
1825 | if (!final_label) | |
1826 | final_label = gen_label_rtx (); | |
1827 | rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); | |
1828 | rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2); | |
1829 | rtx cr = gen_reg_rtx (CCmode); | |
1830 | rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr); | |
37ae4739 | 1831 | emit_insn (gen_movsi (final_result, |
8845cb37 AS |
1832 | gen_lowpart (SImode, tmp_reg_src2))); |
1833 | rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx); | |
1834 | rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx, | |
1835 | fin_ref, pc_rtx); | |
faaeebd6 AS |
1836 | rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); |
1837 | add_reg_br_prob_note (j, profile_probability::unlikely ()); | |
8845cb37 AS |
1838 | JUMP_LABEL (j) = final_label; |
1839 | LABEL_NUSES (final_label) += 1; | |
1840 | } | |
1841 | else | |
1842 | { | |
1843 | if (word_mode == DImode) | |
1844 | { | |
1845 | emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1, | |
1846 | tmp_reg_src2)); | |
37ae4739 | 1847 | emit_insn (gen_movsi (final_result, |
8845cb37 AS |
1848 | gen_lowpart (SImode, tmp_reg_src2))); |
1849 | } | |
1850 | else | |
37ae4739 | 1851 | emit_insn (gen_subsi3 (final_result, tmp_reg_src1, tmp_reg_src2)); |
8845cb37 AS |
1852 | |
1853 | if (final_label) | |
1854 | { | |
1855 | rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); | |
1856 | rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref)); | |
5ec3397e | 1857 | JUMP_LABEL (j) = final_label; |
8845cb37 AS |
1858 | LABEL_NUSES (final_label) += 1; |
1859 | emit_barrier (); | |
1860 | } | |
1861 | } | |
1862 | } | |
1863 | else | |
1864 | { | |
1865 | /* Do we need a 64->32 conversion block? We need the 64->32 | |
37ae4739 | 1866 | conversion even if final_result size == load_mode size because |
8845cb37 | 1867 | the subtract generates one extra bit. */ |
37ae4739 | 1868 | need_6432_conv = true; |
8845cb37 AS |
1869 | |
1870 | if (remain > 0) | |
1871 | { | |
1872 | if (!convert_label) | |
1873 | convert_label = gen_label_rtx (); | |
1874 | ||
1875 | /* Compare to zero and branch to convert_label if not zero. */ | |
1876 | rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label); | |
1877 | if (TARGET_P9_MISC) | |
1878 | { | |
37ae4739 AS |
1879 | /* Generate a compare, and convert with a setb later. |
1880 | Use cond that is passed in because the caller needs | |
1881 | to use it for the 64->32 conversion later. */ | |
8845cb37 AS |
1882 | rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1, |
1883 | tmp_reg_src2); | |
1884 | emit_insn (gen_rtx_SET (cond, cmp)); | |
1885 | } | |
1886 | else | |
37ae4739 AS |
1887 | { |
1888 | /* Generate a subfc. and use the longer sequence for | |
1889 | conversion. Cond is not used outside this | |
1890 | function in this case. */ | |
1891 | cond = gen_reg_rtx (CCmode); | |
1892 | if (TARGET_64BIT) | |
1893 | emit_insn (gen_subfdi3_carry_dot2 (sub_result, tmp_reg_src2, | |
1894 | tmp_reg_src1, cond)); | |
1895 | else | |
1896 | emit_insn (gen_subfsi3_carry_dot2 (sub_result, tmp_reg_src2, | |
1897 | tmp_reg_src1, cond)); | |
1898 | } | |
1899 | ||
8845cb37 AS |
1900 | rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx); |
1901 | rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx, | |
1902 | cvt_ref, pc_rtx); | |
5585759f AS |
1903 | rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); |
1904 | add_reg_br_prob_note (j, profile_probability::likely ()); | |
5ec3397e | 1905 | JUMP_LABEL (j) = convert_label; |
8845cb37 AS |
1906 | LABEL_NUSES (convert_label) += 1; |
1907 | } | |
1908 | else | |
1909 | { | |
1910 | /* Just do the subtract/compare. Since this is the last block | |
1911 | the convert code will be generated immediately following. */ | |
1912 | if (TARGET_P9_MISC) | |
1913 | { | |
1914 | rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1, | |
1915 | tmp_reg_src2); | |
1916 | emit_insn (gen_rtx_SET (cond, cmp)); | |
1917 | } | |
1918 | else | |
1919 | if (TARGET_64BIT) | |
37ae4739 | 1920 | emit_insn (gen_subfdi3_carry (sub_result, tmp_reg_src2, |
8845cb37 AS |
1921 | tmp_reg_src1)); |
1922 | else | |
37ae4739 | 1923 | emit_insn (gen_subfsi3_carry (sub_result, tmp_reg_src2, |
8845cb37 AS |
1924 | tmp_reg_src1)); |
1925 | } | |
1926 | } | |
1927 | ||
1928 | offset += cmp_bytes; | |
1929 | bytes -= cmp_bytes; | |
1930 | } | |
1931 | ||
37ae4739 AS |
1932 | if (convert_label) |
1933 | *p_convert_label = convert_label; | |
1934 | if (final_label) | |
1935 | *p_final_label = final_label; | |
1936 | return need_6432_conv; | |
1937 | } | |
1938 | ||
1939 | /* Expand a block compare operation, and return true if successful. | |
1940 | Return false if we should let the compiler generate normal code, | |
1941 | probably a memcmp call. | |
1942 | ||
1943 | OPERANDS[0] is the target (result). | |
1944 | OPERANDS[1] is the first source. | |
1945 | OPERANDS[2] is the second source. | |
1946 | OPERANDS[3] is the length. | |
1947 | OPERANDS[4] is the alignment. */ | |
1948 | bool | |
1949 | expand_block_compare (rtx operands[]) | |
1950 | { | |
d92d26ff HG |
1951 | /* TARGET_POPCNTD is already guarded at expand cmpmemsi. */ |
1952 | gcc_assert (TARGET_POPCNTD); | |
37ae4739 | 1953 | |
464de9c2 HG |
1954 | /* For P8, this case is complicated to handle because the subtract |
1955 | with carry instructions do not generate the 64-bit carry and so | |
1956 | we must emit code to calculate it ourselves. We skip it on P8 | |
1957 | but setb works well on P9. */ | |
1958 | if (TARGET_32BIT | |
1959 | && TARGET_POWERPC64 | |
1960 | && !TARGET_P9_MISC) | |
37ae4739 AS |
1961 | return false; |
1962 | ||
37ae4739 AS |
1963 | /* Allow this param to shut off all expansion. */ |
1964 | if (rs6000_block_compare_inline_limit == 0) | |
1965 | return false; | |
1966 | ||
d92d26ff HG |
1967 | rtx target = operands[0]; |
1968 | rtx orig_src1 = operands[1]; | |
1969 | rtx orig_src2 = operands[2]; | |
1970 | rtx bytes_rtx = operands[3]; | |
1971 | rtx align_rtx = operands[4]; | |
37ae4739 | 1972 | |
d92d26ff HG |
1973 | /* targetm.slow_unaligned_access -- don't do unaligned stuff. */ |
1974 | if (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1)) | |
1975 | || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2))) | |
37ae4739 AS |
1976 | return false; |
1977 | ||
1978 | /* If this is not a fixed size compare, try generating loop code and | |
1979 | if that fails just call memcmp. */ | |
1980 | if (!CONST_INT_P (bytes_rtx)) | |
1981 | return expand_compare_loop (operands); | |
1982 | ||
1983 | /* This must be a fixed size alignment. */ | |
1984 | if (!CONST_INT_P (align_rtx)) | |
1985 | return false; | |
1986 | ||
78bd9e25 HG |
1987 | unsigned int align_by_bits = UINTVAL (align_rtx); |
1988 | unsigned int base_align = align_by_bits / BITS_PER_UNIT; | |
37ae4739 AS |
1989 | |
1990 | gcc_assert (GET_MODE (target) == SImode); | |
1991 | ||
1992 | /* Anything to move? */ | |
1993 | unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx); | |
1994 | if (bytes == 0) | |
1995 | return true; | |
1996 | ||
1997 | /* P7/P8 code uses cond for subfc. but P9 uses | |
1998 | it for cmpld which needs CCUNSmode. */ | |
1999 | rtx cond = NULL; | |
2000 | if (TARGET_P9_MISC) | |
2001 | cond = gen_reg_rtx (CCUNSmode); | |
2002 | ||
2003 | /* Is it OK to use vec/vsx for this? TARGET_VSX means we have at |
2004 | least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is | |
2005 | at least POWER8. That way we can rely on overlapping compares to | |
2006 | do the final comparison of less than 16 bytes. Also I do not | |
2007 | want to deal with making this work for 32 bits. In addition, we | |
2008 | have to make sure that we have at least P8_VECTOR (we don't allow | |
2009 | P9_VECTOR without P8_VECTOR). */ | |
2010 | int use_vec = (bytes >= 33 && !TARGET_32BIT | |
2011 | && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR); | |
2012 | ||
2013 | /* We don't want to generate too much code. The loop code can take | |
2014 | over for lengths greater than 31 bytes. */ | |
2015 | unsigned HOST_WIDE_INT max_bytes = rs6000_block_compare_inline_limit; | |
2016 | ||
2017 | /* Don't generate too much code if vsx was disabled. */ | |
2018 | if (!use_vec && max_bytes > 1) | |
2019 | max_bytes = ((max_bytes + 1) / 2) - 1; | |
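/* (Halving the limit here is a rough adjustment: the GPR sequence
   handles at most 8 bytes per compare where the VSX sequence handles
   16, so the same byte budget would roughly double the emitted code.)  */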
2020 | ||
2021 | if (!IN_RANGE (bytes, 1, max_bytes)) | |
2022 | return expand_compare_loop (operands); | |
2023 | ||
37ae4739 AS |
2024 | rtx final_label = NULL; |
2025 | ||
2026 | if (use_vec) | |
8845cb37 | 2027 | { |
37ae4739 AS |
2028 | rtx final_move_label = gen_label_rtx (); |
2029 | rtx s1addr = gen_reg_rtx (Pmode); | |
2030 | rtx s2addr = gen_reg_rtx (Pmode); | |
2031 | rtx off_reg = gen_reg_rtx (Pmode); | |
2032 | rtx cleanup_label = NULL; | |
2033 | rtx vec_result = gen_reg_rtx (V16QImode); | |
2034 | rtx s1data = gen_reg_rtx (V16QImode); | |
2035 | rtx s2data = gen_reg_rtx (V16QImode); | |
2036 | rtx result_reg = gen_reg_rtx (word_mode); | |
2037 | emit_move_insn (result_reg, GEN_INT (0)); | |
8845cb37 | 2038 | |
37ae4739 AS |
2039 | expand_cmp_vec_sequence (bytes, orig_src1, orig_src2, |
2040 | s1addr, s2addr, off_reg, s1data, s2data, | |
2041 | vec_result, false, | |
2042 | &cleanup_label, final_move_label, false); | |
2043 | ||
2044 | if (cleanup_label) | |
2045 | emit_label (cleanup_label); | |
2046 | ||
2047 | emit_insn (gen_one_cmplv16qi2 (vec_result, vec_result)); | |
2048 | ||
2049 | emit_final_compare_vec (s1data, s2data, result_reg, | |
2050 | s1addr, s2addr, orig_src1, orig_src2, | |
2051 | off_reg, vec_result); | |
2052 | ||
2053 | emit_label (final_move_label); | |
2054 | emit_insn (gen_movsi (target, | |
2055 | gen_lowpart (SImode, result_reg))); | |
2056 | } | |
2057 | else | |
2058 | { /* generate GPR code */ | |
2059 | ||
2060 | rtx convert_label = NULL; | |
2061 | rtx sub_result = gen_reg_rtx (word_mode); | |
2062 | bool need_6432_conversion = | |
2063 | expand_block_compare_gpr(bytes, base_align, | |
2064 | orig_src1, orig_src2, | |
2065 | sub_result, cond, target, | |
2066 | &convert_label, &final_label); | |
2067 | ||
2068 | if (need_6432_conversion) | |
8845cb37 | 2069 | { |
37ae4739 AS |
2070 | if (convert_label) |
2071 | emit_label (convert_label); | |
2072 | if (TARGET_P9_MISC) | |
2073 | emit_insn (gen_setb_unsigned (target, cond)); | |
8845cb37 | 2074 | else |
37ae4739 | 2075 | generate_6432_conversion(target, sub_result); |
8845cb37 AS |
2076 | } |
2077 | } | |
2078 | ||
2079 | if (final_label) | |
2080 | emit_label (final_label); | |
2081 | ||
8845cb37 AS |
2082 | return true; |
2083 | } | |
2084 | ||
f7e94dfb | 2085 | /* Generate page crossing check and branch code to set up for |
8845cb37 AS |
2086 | strncmp when we don't have DI alignment. |
2087 | STRNCMP_LABEL is the label to branch to if there is a page crossing. |
f7e94dfb | 2088 | SRC_ADDR is the string address to be examined. |
8845cb37 AS |
2089 | BYTES is the max number of bytes to compare. */ |
2090 | static void | |
f7e94dfb | 2091 | expand_strncmp_align_check (rtx strncmp_label, rtx src_addr, HOST_WIDE_INT bytes) |
8845cb37 AS |
2092 | { |
2093 | rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label); | |
f7e94dfb AS |
2094 | rtx src_pgoff = gen_reg_rtx (GET_MODE (src_addr)); |
2095 | do_and3 (src_pgoff, src_addr, GEN_INT (0xfff)); | |
8845cb37 | 2096 | rtx cond = gen_reg_rtx (CCmode); |
f7e94dfb | 2097 | emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_pgoff, |
8845cb37 AS |
2098 | GEN_INT (4096 - bytes))); |
2099 | ||
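/* That is, branch to STRNCMP_LABEL when (src_addr & 0xfff) >= 4096 - bytes,
   since an access of BYTES bytes starting at src_addr could then run
   into the next 4K page.  */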
0c791c59 | 2100 | rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx); |
8845cb37 AS |
2101 | |
2102 | rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, | |
0c791c59 | 2103 | lab_ref, pc_rtx); |
faaeebd6 AS |
2104 | rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); |
2105 | add_reg_br_prob_note (j, profile_probability::unlikely ()); | |
8845cb37 AS |
2106 | JUMP_LABEL (j) = strncmp_label; |
2107 | LABEL_NUSES (strncmp_label) += 1; | |
2108 | } | |
2109 | ||
74f9986e AS |
2110 | /* Generate the sequence of compares for strcmp/strncmp using gpr instructions. |
2111 | BYTES_TO_COMPARE is the number of bytes to be compared. | |
2112 | BASE_ALIGN is the smaller of the alignment of the two strings. | |
2113 | ORIG_SRC1 is the unmodified rtx for the first string. | |
2114 | ORIG_SRC2 is the unmodified rtx for the second string. | |
2115 | TMP_REG_SRC1 is the register for loading the first string. | |
2116 | TMP_REG_SRC2 is the register for loading the second string. | |
2117 | RESULT_REG is the rtx for the result register. | |
2118 | EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call | |
2119 | to strcmp/strncmp if we have equality at the end of the inline comparison. | |
9d36bd3b AS |
2120 | P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code |
2121 | to clean up and generate the final comparison result. | |
ef4adf1f | 2122 | FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just |
74f9986e AS |
2123 | set the final result. */ |
2124 | static void | |
9d36bd3b AS |
2125 | expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare, |
2126 | unsigned int base_align, | |
2127 | rtx orig_src1, rtx orig_src2, | |
2128 | rtx tmp_reg_src1, rtx tmp_reg_src2, rtx result_reg, | |
2129 | bool equality_compare_rest, rtx *p_cleanup_label, | |
2130 | rtx final_move_label) | |
74f9986e AS |
2131 | { |
2132 | unsigned int word_mode_size = GET_MODE_SIZE (word_mode); | |
2133 | machine_mode load_mode; | |
2134 | unsigned int load_mode_size; | |
2135 | unsigned HOST_WIDE_INT cmp_bytes = 0; | |
2136 | unsigned HOST_WIDE_INT offset = 0; | |
2137 | rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0)); | |
2138 | rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0)); | |
9d36bd3b AS |
2139 | gcc_assert (p_cleanup_label != NULL); |
2140 | rtx cleanup_label = *p_cleanup_label; | |
74f9986e AS |
2141 | |
2142 | while (bytes_to_compare > 0) | |
2143 | { | |
2144 | /* GPR compare sequence: | |
ef4adf1f AS |
2145 | check each 8B with: ld/ld/cmpb/cmpb/orc./bne |
2146 | ||
74f9986e | 2147 | cleanup code at end: |
74f9986e AS |
2148 | cntlzd get bit of first zero/diff byte |
2149 | subfic convert for rldcl use | |
2150 | rldcl rldcl extract diff/zero byte | |
2151 | subf subtract for final result | |
2152 | ||
2153 | The last compare can branch around the cleanup code if the | |
2154 | result is zero because the strings are exactly equal. */ | |
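/* cmpb sets each result byte to 0xff where the corresponding source
   bytes are equal and to 0x00 otherwise, so OR-ing the complement of
   the "bytes equal" mask with the "byte is zero" mask (the orc. in the
   sequence above) is nonzero exactly when this chunk contains a
   difference or the terminating zero byte.  */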
ef4adf1f | 2155 | |
74f9986e AS |
2156 | unsigned int align = compute_current_alignment (base_align, offset); |
2157 | load_mode = select_block_compare_mode (offset, bytes_to_compare, align); | |
2158 | load_mode_size = GET_MODE_SIZE (load_mode); | |
2159 | if (bytes_to_compare >= load_mode_size) | |
2160 | cmp_bytes = load_mode_size; | |
78bd9e25 HG |
2161 | else if (!targetm.slow_unaligned_access (load_mode, |
2162 | align * BITS_PER_UNIT)) | |
74f9986e AS |
2163 | { |
2164 | /* Move this load back so it doesn't go past the end. | |
2165 | P8/P9 can do this efficiently. */ | |
2166 | unsigned int extra_bytes = load_mode_size - bytes_to_compare; | |
2167 | cmp_bytes = bytes_to_compare; | |
2168 | if (extra_bytes < offset) | |
2169 | { | |
2170 | offset -= extra_bytes; | |
2171 | cmp_bytes = load_mode_size; | |
2172 | bytes_to_compare = cmp_bytes; | |
2173 | } | |
2174 | } | |
2175 | else | |
2176 | /* P7 and earlier can't do the overlapping load trick fast, | |
2177 | so this forces a non-overlapping load and a shift to get | |
2178 | rid of the extra bytes. */ | |
2179 | cmp_bytes = bytes_to_compare; | |
2180 | ||
122d6c36 AS |
2181 | rtx offset_rtx; |
2182 | if (BYTES_BIG_ENDIAN || TARGET_AVOID_XFORM) | |
2183 | offset_rtx = GEN_INT (offset); | |
2184 | else | |
2185 | { | |
2186 | offset_rtx = gen_reg_rtx (Pmode); | |
2187 | emit_move_insn (offset_rtx, GEN_INT (offset)); | |
2188 | } | |
2189 | rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_rtx); | |
2190 | rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_rtx); | |
37ae4739 | 2191 | |
74f9986e | 2192 | do_load_for_compare_from_addr (load_mode, tmp_reg_src1, addr1, orig_src1); |
74f9986e AS |
2193 | do_load_for_compare_from_addr (load_mode, tmp_reg_src2, addr2, orig_src2); |
2194 | ||
2195 | /* We must always left-align the data we read, and | |
2196 | clear any bytes to the right that are beyond the string. | |
2197 | Otherwise the cmpb sequence won't produce the correct | |
ef4adf1f AS |
2198 | results. However if there is only one byte left, we |
2199 | can just subtract to get the final result so the shifts | |
2200 | and clears are not needed. */ | |
74f9986e | 2201 | |
ef4adf1f | 2202 | unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes; |
74f9986e | 2203 | |
ef4adf1f AS |
2204 | /* Loading just a single byte is a special case. If we are |
2205 | loading more than that, we have to check whether we are | |
2206 | looking at the entire chunk of data. If not, rotate left and | |
2207 | clear right so that bytes we aren't supposed to look at are | |
2208 | zeroed, and the first byte we are supposed to compare is | |
2209 | leftmost. */ | |
2210 | if (load_mode_size != 1) | |
74f9986e | 2211 | { |
ef4adf1f AS |
2212 | if (load_mode_size < word_mode_size) |
2213 | { | |
2214 | /* Rotate left first. */ | |
2215 | rtx sh = GEN_INT (BITS_PER_UNIT | |
2216 | * (word_mode_size - load_mode_size)); | |
2217 | do_rotl3 (tmp_reg_src1, tmp_reg_src1, sh); | |
2218 | do_rotl3 (tmp_reg_src2, tmp_reg_src2, sh); | |
2219 | } | |
2220 | ||
2221 | if (cmp_bytes < word_mode_size) | |
2222 | { | |
2223 | /* Now clear right. This plus the rotate can be | |
2224 | turned into a rldicr instruction. */ | |
2225 | HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes); | |
2226 | rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb); | |
2227 | do_and3 (tmp_reg_src1, tmp_reg_src1, mask); | |
2228 | do_and3 (tmp_reg_src2, tmp_reg_src2, mask); | |
2229 | } | |
74f9986e AS |
2230 | } |
2231 | ||
2232 | /* Cases to handle. A and B are chunks of the two strings. | |
2233 | 1: Not end of comparison: | |
2234 | A != B: branch to cleanup code to compute result. | |
2235 | A == B: check for 0 byte, next block if not found. | |
2236 | 2: End of the inline comparison: | |
2237 | A != B: branch to cleanup code to compute result. | |
2238 | A == B: check for 0 byte, call strcmp/strncmp | |
2239 | 3: compared requested N bytes: | |
2240 | A == B: branch to result 0. | |
2241 | A != B: cleanup code to compute result. */ | |
2242 | ||
74f9986e AS |
2243 | rtx dst_label; |
2244 | if (remain > 0 || equality_compare_rest) | |
2245 | { | |
2246 | /* Branch to cleanup code, otherwise fall through to do | |
2247 | more compares. */ | |
2248 | if (!cleanup_label) | |
2249 | cleanup_label = gen_label_rtx (); | |
2250 | dst_label = cleanup_label; | |
2251 | } | |
2252 | else | |
2253 | /* Branch to end and produce result of 0. */ | |
2254 | dst_label = final_move_label; | |
2255 | ||
ef4adf1f AS |
2256 | if (load_mode_size == 1) |
2257 | { | |
2258 | /* Special case for comparing just a single byte. */ |
2259 | if (equality_compare_rest) | |
2260 | { | |
2261 | /* Use subf./bne to branch to final_move_label if the | |
2262 | byte differs, otherwise fall through to the strncmp | |
2263 | call. We must also check for a zero byte here as we | |
2264 | must not make the library call if this is the end of | |
2265 | the string. */ | |
2266 | ||
2267 | rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label); | |
2268 | rtx cond = gen_reg_rtx (CCmode); | |
2269 | rtx diff_rtx = gen_rtx_MINUS (word_mode, | |
2270 | tmp_reg_src1, tmp_reg_src2); | |
2271 | rs6000_emit_dot_insn (result_reg, diff_rtx, 2, cond); | |
2272 | rtx cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx); | |
2273 | ||
2274 | rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, | |
2275 | lab_ref, pc_rtx); | |
faaeebd6 AS |
2276 | rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); |
2277 | add_reg_br_prob_note (j, profile_probability::unlikely ()); | |
ef4adf1f AS |
2278 | JUMP_LABEL (j) = final_move_label; |
2279 | LABEL_NUSES (final_move_label) += 1; | |
74f9986e | 2280 | |
ef4adf1f AS |
2281 | /* Check for a zero byte here before falling through to the |
2282 | library call. This catches the case where the |
2283 | strings are equal and end in a zero byte at this | |
2284 | position. */ | |
74f9986e | 2285 | |
ef4adf1f AS |
2286 | rtx cond0 = gen_reg_rtx (CCmode); |
2287 | emit_move_insn (cond0, gen_rtx_COMPARE (CCmode, tmp_reg_src1, | |
2288 | const0_rtx)); | |
74f9986e | 2289 | |
ef4adf1f | 2290 | rtx cmp0eq_rtx = gen_rtx_EQ (VOIDmode, cond0, const0_rtx); |
74f9986e | 2291 | |
ef4adf1f AS |
2292 | rtx ifelse0 = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp0eq_rtx, |
2293 | lab_ref, pc_rtx); | |
faaeebd6 AS |
2294 | rtx_insn *j0 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse0)); |
2295 | add_reg_br_prob_note (j0, profile_probability::unlikely ()); | |
ef4adf1f AS |
2296 | JUMP_LABEL (j0) = final_move_label; |
2297 | LABEL_NUSES (final_move_label) += 1; | |
2298 | } | |
2299 | else | |
2300 | { | |
2301 | /* This is the last byte to be compared so we can use | |
2302 | subf to compute the final result and branch | |
2303 | unconditionally to final_move_label. */ | |
2304 | ||
2305 | do_sub3 (result_reg, tmp_reg_src1, tmp_reg_src2); | |
2306 | ||
2307 | rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label); | |
2308 | rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref)); | |
2309 | JUMP_LABEL (j) = final_move_label; | |
2310 | LABEL_NUSES (final_move_label) += 1; | |
2311 | emit_barrier (); | |
2312 | } | |
2313 | } | |
2314 | else | |
74f9986e | 2315 | { |
74f9986e | 2316 | rtx cmpb_zero = gen_reg_rtx (word_mode); |
ef4adf1f | 2317 | rtx cmpb_diff = gen_reg_rtx (word_mode); |
74f9986e | 2318 | rtx zero_reg = gen_reg_rtx (word_mode); |
ef4adf1f AS |
2319 | rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label); |
2320 | rtx cond = gen_reg_rtx (CCmode); | |
2321 | ||
74f9986e | 2322 | emit_move_insn (zero_reg, GEN_INT (0)); |
ef4adf1f | 2323 | do_cmpb3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2); |
74f9986e | 2324 | do_cmpb3 (cmpb_zero, tmp_reg_src1, zero_reg); |
ef4adf1f AS |
2325 | rtx not_diff = gen_rtx_NOT (word_mode, cmpb_diff); |
2326 | rtx orc_rtx = gen_rtx_IOR (word_mode, not_diff, cmpb_zero); | |
74f9986e | 2327 | |
ef4adf1f | 2328 | rs6000_emit_dot_insn (result_reg, orc_rtx, 2, cond); |
74f9986e | 2329 | |
ef4adf1f AS |
2330 | rtx cmp_rtx; |
2331 | if (remain == 0 && !equality_compare_rest) | |
2332 | cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx); | |
2333 | else | |
2334 | cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx); | |
74f9986e | 2335 | |
ef4adf1f AS |
2336 | rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, |
2337 | lab_ref, pc_rtx); | |
faaeebd6 AS |
2338 | rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); |
2339 | add_reg_br_prob_note (j, profile_probability::unlikely ()); | |
ef4adf1f AS |
2340 | JUMP_LABEL (j) = dst_label; |
2341 | LABEL_NUSES (dst_label) += 1; | |
74f9986e AS |
2342 | } |
2343 | ||
2344 | offset += cmp_bytes; | |
2345 | bytes_to_compare -= cmp_bytes; | |
2346 | } | |
2347 | ||
9d36bd3b AS |
2348 | *p_cleanup_label = cleanup_label; |
2349 | return; | |
2350 | } | |
2351 | ||
f7e94dfb AS |
2352 | /* Generate the final sequence that identifies the differing |
2353 | byte and generates the final result, taking into account | |
2354 | zero bytes: | |
ef4adf1f | 2355 | |
f7e94dfb AS |
2356 | cntlzd get bit of first zero/diff byte |
2357 | addi convert for rldcl use | |
2358 | rldcl rldcl extract diff/zero byte | |
2359 | subf subtract for final result | |
2360 | ||
2361 | STR1 is the reg rtx for data from string 1. | |
2362 | STR2 is the reg rtx for data from string 2. | |
2363 | RESULT is the reg rtx for the comparison result. */ | |
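/* For example, assuming the usual left-aligned layout produced by the
   byte-reversed loads: if the first differing or zero byte is byte 2
   counting from the most significant end, RESULT is preceded by 16 zero
   bits, so cntlzd returns 16, and rotating left by 16 + 8 = 24 bits
   brings exactly that byte into the low 8 bits of each operand before
   the final subtract.  */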
2364 | ||
2365 | static void | |
2366 | emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result) | |
2367 | { | |
2368 | machine_mode m = GET_MODE (str1); | |
f7e94dfb | 2369 | rtx rot_amt = gen_reg_rtx (m); |
f7e94dfb AS |
2370 | |
2371 | rtx rot1_1 = gen_reg_rtx (m); | |
2372 | rtx rot1_2 = gen_reg_rtx (m); | |
2373 | rtx rot2_1 = gen_reg_rtx (m); | |
2374 | rtx rot2_2 = gen_reg_rtx (m); | |
2375 | ||
2376 | if (m == SImode) | |
2377 | { | |
ef4adf1f | 2378 | emit_insn (gen_clzsi2 (rot_amt, result)); |
f7e94dfb AS |
2379 | emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8))); |
2380 | emit_insn (gen_rotlsi3 (rot1_1, str1, | |
2381 | gen_lowpart (SImode, rot_amt))); | |
2382 | emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff))); | |
2383 | emit_insn (gen_rotlsi3 (rot2_1, str2, | |
2384 | gen_lowpart (SImode, rot_amt))); | |
2385 | emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff))); | |
2386 | emit_insn (gen_subsi3 (result, rot1_2, rot2_2)); | |
2387 | } | |
2388 | else if (m == DImode) | |
2389 | { | |
ef4adf1f | 2390 | emit_insn (gen_clzdi2 (rot_amt, result)); |
f7e94dfb AS |
2391 | emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8))); |
2392 | emit_insn (gen_rotldi3 (rot1_1, str1, | |
2393 | gen_lowpart (SImode, rot_amt))); | |
2394 | emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff))); | |
2395 | emit_insn (gen_rotldi3 (rot2_1, str2, | |
2396 | gen_lowpart (SImode, rot_amt))); | |
2397 | emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff))); | |
2398 | emit_insn (gen_subdi3 (result, rot1_2, rot2_2)); | |
2399 | } | |
2400 | else | |
2401 | gcc_unreachable (); | |
ef4adf1f | 2402 | |
f7e94dfb AS |
2403 | return; |
2404 | } | |
2405 | ||
8845cb37 | 2406 | /* Expand a string compare operation with length, and return |
ef4adf1f | 2407 | true if successful. Return false if we should let the |
8845cb37 AS |
2408 | compiler generate normal code, probably a strncmp call. |
2409 | ||
2410 | OPERANDS[0] is the target (result). | |
2411 | OPERANDS[1] is the first source. | |
2412 | OPERANDS[2] is the second source. | |
2413 | If NO_LENGTH is zero, then: | |
2414 | OPERANDS[3] is the length. | |
2415 | OPERANDS[4] is the alignment in bytes. | |
2416 | If NO_LENGTH is nonzero, then: | |
2417 | OPERANDS[3] is the alignment in bytes. */ | |
2418 | bool | |
2419 | expand_strn_compare (rtx operands[], int no_length) | |
2420 | { | |
2421 | rtx target = operands[0]; | |
2422 | rtx orig_src1 = operands[1]; | |
2423 | rtx orig_src2 = operands[2]; | |
2424 | rtx bytes_rtx, align_rtx; | |
2425 | if (no_length) | |
2426 | { | |
2427 | bytes_rtx = NULL; | |
2428 | align_rtx = operands[3]; | |
2429 | } | |
2430 | else | |
2431 | { | |
2432 | bytes_rtx = operands[3]; | |
2433 | align_rtx = operands[4]; | |
2434 | } | |
74f9986e | 2435 | |
f7e94dfb AS |
2436 | rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0)); |
2437 | rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0)); | |
8845cb37 | 2438 | |
ef4adf1f | 2439 | /* If we have a length, it must be constant. This simplifies things |
8845cb37 | 2440 | a bit as we don't have to generate code to check if we've exceeded |
ef4adf1f | 2441 | the length. Later this could be expanded to handle this case. */ |
8845cb37 AS |
2442 | if (!no_length && !CONST_INT_P (bytes_rtx)) |
2443 | return false; | |
2444 | ||
2445 | /* This must be a fixed size alignment. */ | |
2446 | if (!CONST_INT_P (align_rtx)) | |
2447 | return false; | |
2448 | ||
2449 | unsigned int base_align = UINTVAL (align_rtx); | |
f7e94dfb AS |
2450 | unsigned int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT; |
2451 | unsigned int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT; | |
8845cb37 | 2452 | |
e0bd6c9f RS |
2453 | /* targetm.slow_unaligned_access -- don't do unaligned stuff. */ |
2454 | if (targetm.slow_unaligned_access (word_mode, align1) | |
2455 | || targetm.slow_unaligned_access (word_mode, align2)) | |
8845cb37 AS |
2456 | return false; |
2457 | ||
2458 | gcc_assert (GET_MODE (target) == SImode); | |
2459 | ||
9d36bd3b | 2460 | unsigned int required_align = 8; |
8845cb37 AS |
2461 | |
2462 | unsigned HOST_WIDE_INT offset = 0; | |
2463 | unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available. */ | |
2464 | unsigned HOST_WIDE_INT compare_length; /* How much to compare inline. */ | |
9d36bd3b | 2465 | |
8845cb37 | 2466 | if (no_length) |
9d36bd3b | 2467 | bytes = rs6000_string_compare_inline_limit; |
8845cb37 AS |
2468 | else |
2469 | bytes = UINTVAL (bytes_rtx); | |
2470 | ||
ef4adf1f | 2471 | /* Is it OK to use vec/vsx for this? TARGET_VSX means we have at |
9d36bd3b AS |
2472 | least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is |
2473 | at least POWER8. That way we can rely on overlapping compares to | |
6bd2b8ec AS |
2474 | do the final comparison of less than 16 bytes. Also I do not |
2475 | want to deal with making this work for 32 bits. In addition, we | |
2476 | have to make sure that we have at least P8_VECTOR (we don't allow | |
2477 | P9_VECTOR without P8_VECTOR). */ | |
2478 | int use_vec = (bytes >= 16 && !TARGET_32BIT | |
2479 | && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR); | |
9d36bd3b AS |
2480 | |
2481 | if (use_vec) | |
2482 | required_align = 16; | |
2483 | ||
2484 | machine_mode load_mode; | |
2485 | rtx tmp_reg_src1, tmp_reg_src2; | |
2486 | if (use_vec) | |
2487 | { | |
2488 | load_mode = V16QImode; | |
2489 | tmp_reg_src1 = gen_reg_rtx (V16QImode); | |
2490 | tmp_reg_src2 = gen_reg_rtx (V16QImode); | |
2491 | } | |
2492 | else | |
2493 | { | |
2494 | load_mode = select_block_compare_mode (0, bytes, base_align); | |
2495 | tmp_reg_src1 = gen_reg_rtx (word_mode); | |
2496 | tmp_reg_src2 = gen_reg_rtx (word_mode); | |
2497 | } | |
2498 | ||
2499 | compare_length = rs6000_string_compare_inline_limit; | |
8845cb37 AS |
2500 | |
2501 | /* If we have equality at the end of the last compare and we have not | |
2502 | found the end of the string, we need to call strcmp/strncmp to | |
2503 | compare the remainder. */ | |
2504 | bool equality_compare_rest = false; | |
2505 | ||
2506 | if (no_length) | |
2507 | { | |
2508 | bytes = compare_length; | |
2509 | equality_compare_rest = true; | |
2510 | } | |
2511 | else | |
2512 | { | |
2513 | if (bytes <= compare_length) | |
2514 | compare_length = bytes; | |
2515 | else | |
2516 | equality_compare_rest = true; | |
2517 | } | |
2518 | ||
2519 | rtx result_reg = gen_reg_rtx (word_mode); | |
2520 | rtx final_move_label = gen_label_rtx (); | |
2521 | rtx final_label = gen_label_rtx (); | |
2522 | rtx begin_compare_label = NULL; | |
ef4adf1f | 2523 | |
f7e94dfb | 2524 | if (base_align < required_align) |
8845cb37 AS |
2525 | { |
2526 | /* Generate code that checks distance to 4k boundary for this case. */ | |
2527 | begin_compare_label = gen_label_rtx (); | |
2528 | rtx strncmp_label = gen_label_rtx (); | |
2529 | rtx jmp; | |
2530 | ||
2531 | /* Strncmp for power8 in glibc does this: | |
5ec3397e AS |
2532 | rldicl r8,r3,0,52 |
2533 | cmpldi cr7,r8,4096-16 | |
2534 | bgt cr7,L(pagecross) */ | |
8845cb37 AS |
2535 | |
2536 | /* Make sure that the length we use for the alignment test and | |
2537 | the subsequent code generation are in agreement so we do not | |
2538 | go past the length we tested for a 4k boundary crossing. */ | |
2539 | unsigned HOST_WIDE_INT align_test = compare_length; | |
9d36bd3b | 2540 | if (align_test < required_align) |
8845cb37 AS |
2541 | { |
2542 | align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test); | |
2543 | base_align = align_test; | |
2544 | } | |
2545 | else | |
2546 | { | |
f7e94dfb AS |
2547 | align_test = ROUND_UP (align_test, required_align); |
2548 | base_align = required_align; | |
8845cb37 AS |
2549 | } |
2550 | ||
f7e94dfb AS |
2551 | if (align1 < required_align) |
2552 | expand_strncmp_align_check (strncmp_label, src1_addr, align_test); | |
2553 | if (align2 < required_align) | |
2554 | expand_strncmp_align_check (strncmp_label, src2_addr, align_test); | |
8845cb37 AS |
2555 | |
2556 | /* Now generate the following sequence: | |
2557 | - branch to begin_compare | |
2558 | - strncmp_label | |
2559 | - call to strncmp | |
2560 | - branch to final_label | |
2561 | - begin_compare_label */ | |
2562 | ||
2563 | rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label); | |
2564 | jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref)); | |
2565 | JUMP_LABEL (jmp) = begin_compare_label; | |
2566 | LABEL_NUSES (begin_compare_label) += 1; | |
2567 | emit_barrier (); | |
2568 | ||
2569 | emit_label (strncmp_label); | |
2570 | ||
8845cb37 AS |
2571 | if (no_length) |
2572 | { | |
2573 | tree fun = builtin_decl_explicit (BUILT_IN_STRCMP); | |
2574 | emit_library_call_value (XEXP (DECL_RTL (fun), 0), | |
db69559b | 2575 | target, LCT_NORMAL, GET_MODE (target), |
f7e94dfb AS |
2576 | force_reg (Pmode, src1_addr), Pmode, |
2577 | force_reg (Pmode, src2_addr), Pmode); | |
8845cb37 AS |
2578 | } |
2579 | else | |
2580 | { | |
2581 | /* -m32 -mpowerpc64 results in word_mode being DImode even | |
9d36bd3b | 2582 | though otherwise it is 32-bit. The length arg to strncmp |
8845cb37 | 2583 | is a size_t which will be the same size as pointers. */ |
e9727bda AS |
2584 | rtx len_rtx = gen_reg_rtx (Pmode); |
2585 | emit_move_insn (len_rtx, gen_int_mode (bytes, Pmode)); | |
8845cb37 AS |
2586 | |
2587 | tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP); | |
2588 | emit_library_call_value (XEXP (DECL_RTL (fun), 0), | |
db69559b | 2589 | target, LCT_NORMAL, GET_MODE (target), |
f7e94dfb AS |
2590 | force_reg (Pmode, src1_addr), Pmode, |
2591 | force_reg (Pmode, src2_addr), Pmode, | |
e9727bda | 2592 | len_rtx, Pmode); |
8845cb37 AS |
2593 | } |
2594 | ||
2595 | rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); | |
2596 | jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref)); | |
2597 | JUMP_LABEL (jmp) = final_label; | |
2598 | LABEL_NUSES (final_label) += 1; | |
2599 | emit_barrier (); | |
2600 | emit_label (begin_compare_label); | |
2601 | } | |
2602 | ||
2603 | rtx cleanup_label = NULL; | |
9d36bd3b | 2604 | rtx s1addr = NULL, s2addr = NULL, off_reg = NULL, vec_result = NULL; |
8845cb37 | 2605 | |
f7e94dfb | 2606 | /* Generate a sequence of GPR or VEC/VSX instructions to compare out |
8845cb37 | 2607 | to the length specified. */ |
9d36bd3b AS |
2608 | if (use_vec) |
2609 | { | |
2610 | s1addr = gen_reg_rtx (Pmode); | |
2611 | s2addr = gen_reg_rtx (Pmode); | |
2612 | off_reg = gen_reg_rtx (Pmode); | |
2613 | vec_result = gen_reg_rtx (load_mode); | |
2614 | emit_move_insn (result_reg, GEN_INT (0)); | |
37ae4739 AS |
2615 | expand_cmp_vec_sequence (compare_length, |
2616 | orig_src1, orig_src2, | |
2617 | s1addr, s2addr, off_reg, | |
2618 | tmp_reg_src1, tmp_reg_src2, | |
2619 | vec_result, | |
2620 | equality_compare_rest, | |
2621 | &cleanup_label, final_move_label, true); | |
9d36bd3b AS |
2622 | } |
2623 | else | |
2624 | expand_strncmp_gpr_sequence (compare_length, base_align, | |
2625 | orig_src1, orig_src2, | |
2626 | tmp_reg_src1, tmp_reg_src2, | |
2627 | result_reg, | |
2628 | equality_compare_rest, | |
2629 | &cleanup_label, final_move_label); | |
74f9986e AS |
2630 | |
2631 | offset = compare_length; | |
ef4adf1f | 2632 | |
8845cb37 AS |
2633 | if (equality_compare_rest) |
2634 | { | |
2635 | /* Update pointers past what has been compared already. */ | |
f7e94dfb AS |
2636 | rtx src1 = force_reg (Pmode, |
2637 | gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset))); | |
2638 | rtx src2 = force_reg (Pmode, | |
2639 | gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset))); | |
8845cb37 AS |
2640 | |
2641 | /* Construct call to strcmp/strncmp to compare the rest of the string. */ | |
2642 | if (no_length) | |
2643 | { | |
2644 | tree fun = builtin_decl_explicit (BUILT_IN_STRCMP); | |
2645 | emit_library_call_value (XEXP (DECL_RTL (fun), 0), | |
db69559b | 2646 | target, LCT_NORMAL, GET_MODE (target), |
f7e94dfb | 2647 | src1, Pmode, src2, Pmode); |
8845cb37 AS |
2648 | } |
2649 | else | |
2650 | { | |
e9727bda AS |
2651 | rtx len_rtx = gen_reg_rtx (Pmode); |
2652 | emit_move_insn (len_rtx, gen_int_mode (bytes - compare_length, Pmode)); | |
8845cb37 AS |
2653 | tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP); |
2654 | emit_library_call_value (XEXP (DECL_RTL (fun), 0), | |
db69559b | 2655 | target, LCT_NORMAL, GET_MODE (target), |
e9727bda | 2656 | src1, Pmode, src2, Pmode, len_rtx, Pmode); |
8845cb37 AS |
2657 | } |
2658 | ||
2659 | rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); | |
2660 | rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref)); | |
2661 | JUMP_LABEL (jmp) = final_label; | |
2662 | LABEL_NUSES (final_label) += 1; | |
2663 | emit_barrier (); | |
2664 | } | |
2665 | ||
2666 | if (cleanup_label) | |
2667 | emit_label (cleanup_label); | |
2668 | ||
9d36bd3b | 2669 | if (use_vec) |
37ae4739 AS |
2670 | emit_final_compare_vec (tmp_reg_src1, tmp_reg_src2, result_reg, |
2671 | s1addr, s2addr, orig_src1, orig_src2, | |
2672 | off_reg, vec_result); | |
9d36bd3b AS |
2673 | else |
2674 | emit_final_str_compare_gpr (tmp_reg_src1, tmp_reg_src2, result_reg); | |
8845cb37 AS |
2675 | |
2676 | emit_label (final_move_label); | |
2677 | emit_insn (gen_movsi (target, | |
2678 | gen_lowpart (SImode, result_reg))); | |
2679 | emit_label (final_label); | |
2680 | return true; | |
2681 | } | |
2682 | ||
19db0ebb AS |
2683 | /* Generate loads and stores for a move of v4si mode using lvx/stvx. |
2684 | This uses altivec_{l,st}vx_<mode>_internal which use unspecs to | |
2685 | keep combine from changing what instruction gets used. | |
2686 | ||
2687 | DEST is the destination for the data. | |
2688 | SRC is the source of the data for the move. */ | |
2689 | ||
2690 | static rtx | |
2691 | gen_lvx_v4si_move (rtx dest, rtx src) | |
2692 | { | |
2693 | gcc_assert (MEM_P (dest) ^ MEM_P (src)); | |
2694 | gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode); | |
2695 | ||
2696 | if (MEM_P (dest)) | |
2697 | return gen_altivec_stvx_v4si_internal (dest, src); | |
2698 | else | |
2699 | return gen_altivec_lvx_v4si_internal (dest, src); | |
2700 | } | |
2701 | ||
afd97163 AS |
2702 | static rtx |
2703 | gen_lxvl_stxvl_move (rtx dest, rtx src, int length) | |
2704 | { | |
2705 | gcc_assert (MEM_P (dest) ^ MEM_P (src)); | |
2706 | gcc_assert (GET_MODE (dest) == V16QImode && GET_MODE (src) == V16QImode); | |
2707 | gcc_assert (length <= 16); | |
2708 | ||
2709 | bool is_store = MEM_P (dest); | |
2710 | rtx addr; | |
2711 | ||
2712 | /* If the address form is not a simple register, make it so. */ | |
2713 | if (is_store) | |
2714 | addr = XEXP (dest, 0); | |
2715 | else | |
2716 | addr = XEXP (src, 0); | |
2717 | ||
2718 | if (!REG_P (addr)) | |
2719 | addr = force_reg (Pmode, addr); | |
2720 | ||
2721 | rtx len = force_reg (DImode, gen_int_mode (length, DImode)); | |
2722 | if (is_store) | |
2723 | return gen_stxvl (src, addr, len); | |
2724 | else | |
2725 | return gen_lxvl (dest, addr, len); | |
2726 | } | |
2727 | ||
8845cb37 AS |
2728 | /* Expand a block move operation, and return 1 if successful. Return 0 |
2729 | if we should let the compiler generate normal code. | |
2730 | ||
2731 | operands[0] is the destination | |
2732 | operands[1] is the source | |
2733 | operands[2] is the length | |
2734 | operands[3] is the alignment */ | |
2735 | ||
2736 | #define MAX_MOVE_REG 4 | |
2737 | ||
2738 | int | |
c8241327 | 2739 | expand_block_move (rtx operands[], bool might_overlap) |
8845cb37 AS |
2740 | { |
2741 | rtx orig_dest = operands[0]; | |
2742 | rtx orig_src = operands[1]; | |
2743 | rtx bytes_rtx = operands[2]; | |
2744 | rtx align_rtx = operands[3]; | |
2e42a52f | 2745 | int constp = CONST_INT_P (bytes_rtx); |
8845cb37 AS |
2746 | int align; |
2747 | int bytes; | |
2748 | int offset; | |
2749 | int move_bytes; | |
c8241327 | 2750 | rtx loads[MAX_MOVE_REG]; |
8845cb37 AS |
2751 | rtx stores[MAX_MOVE_REG]; |
2752 | int num_reg = 0; | |
2753 | ||
2754 | /* If this is not a fixed size move, just call memcpy */ | |
2755 | if (! constp) | |
2756 | return 0; | |
2757 | ||
2758 | /* This must be a fixed size alignment */ | |
2e42a52f | 2759 | gcc_assert (CONST_INT_P (align_rtx)); |
8845cb37 AS |
2760 | align = INTVAL (align_rtx) * BITS_PER_UNIT; |
2761 | ||
2762 | /* Anything to move? */ | |
2763 | bytes = INTVAL (bytes_rtx); | |
2764 | if (bytes <= 0) | |
2765 | return 1; | |
2766 | ||
2767 | if (bytes > rs6000_block_move_inline_limit) | |
2768 | return 0; | |
2769 | ||
afd97163 | 2770 | int orig_bytes = bytes; |
8845cb37 AS |
2771 | for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes) |
2772 | { | |
2773 | union { | |
8845cb37 | 2774 | rtx (*mov) (rtx, rtx); |
afd97163 | 2775 | rtx (*movlen) (rtx, rtx, int); |
8845cb37 AS |
2776 | } gen_func; |
2777 | machine_mode mode = BLKmode; | |
2778 | rtx src, dest; | |
afd97163 AS |
2779 | bool move_with_length = false; |
2780 | ||
f8f8909a | 2781 | /* Use OOmode for paired vsx load/store. Use V2DI for single |
afd97163 AS |
2782 | unaligned vsx load/store, for consistency with what other |
2783 | expansions (compare) already do, and so we can use lxvd2x on | |
2784 | p8. Order is VSX pair unaligned, VSX unaligned, Altivec, VSX | |
2785 | with length < 16 (if allowed), then gpr load/store. */ | |
2786 | ||
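/* For example, assuming a 64-bit POWER10 with unaligned VSX and
   vector-pair block ops enabled (and no strict-alignment constraint),
   a 37-byte move is split into one 32-byte OOmode pair copy followed
   by a 5-byte lxvl/stxvl tail: 5 is below 16, is not one of the GPR
   sizes 1/2/4/8, and the original size exceeds 16.  */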
2787 | if (TARGET_MMA && TARGET_BLOCK_OPS_UNALIGNED_VSX | |
2788 | && TARGET_BLOCK_OPS_VECTOR_PAIR | |
2789 | && bytes >= 32 | |
2790 | && (align >= 256 || !STRICT_ALIGNMENT)) | |
2791 | { | |
2792 | move_bytes = 32; | |
f8f8909a AS |
2793 | mode = OOmode; |
2794 | gen_func.mov = gen_movoo; | |
afd97163 AS |
2795 | } |
2796 | else if (TARGET_POWERPC64 && TARGET_BLOCK_OPS_UNALIGNED_VSX | |
2797 | && VECTOR_MEM_VSX_P (V2DImode) | |
2798 | && bytes >= 16 && (align >= 128 || !STRICT_ALIGNMENT)) | |
2799 | { | |
2800 | move_bytes = 16; | |
2801 | mode = V2DImode; | |
2802 | gen_func.mov = gen_vsx_movv2di_64bit; | |
2803 | } | |
2804 | else if (TARGET_BLOCK_OPS_UNALIGNED_VSX | |
946b8967 HG |
2805 | /* Only use lxvl/stxvl on 64bit POWER10. */ |
2806 | && TARGET_POWER10 | |
2807 | && TARGET_64BIT | |
2808 | && bytes < 16 | |
afd97163 | 2809 | && orig_bytes > 16 |
946b8967 HG |
2810 | && !(bytes == 1 |
2811 | || bytes == 2 | |
2812 | || bytes == 4 | |
2813 | || bytes == 8) | |
2814 | && (align >= 128 | |
2815 | || !STRICT_ALIGNMENT)) | |
afd97163 AS |
2816 | { |
2817 | /* Only use lxvl/stxvl if it could replace multiple ordinary | |
2818 | loads+stores. Also don't use it unless we likely already | |
2819 | did one vsx copy so we aren't mixing gpr and vsx. */ | |
2820 | move_bytes = bytes; | |
2821 | mode = V16QImode; | |
2822 | gen_func.movlen = gen_lxvl_stxvl_move; | |
2823 | move_with_length = true; | |
2824 | } | |
2825 | else if (TARGET_ALTIVEC && bytes >= 16 && align >= 128) | |
8845cb37 AS |
2826 | { |
2827 | move_bytes = 16; | |
2828 | mode = V4SImode; | |
19db0ebb | 2829 | gen_func.mov = gen_lvx_v4si_move; |
8845cb37 | 2830 | } |
8845cb37 AS |
2831 | else if (bytes >= 8 && TARGET_POWERPC64 |
2832 | && (align >= 64 || !STRICT_ALIGNMENT)) | |
2833 | { | |
2834 | move_bytes = 8; | |
2835 | mode = DImode; | |
2836 | gen_func.mov = gen_movdi; | |
2837 | if (offset == 0 && align < 64) | |
2838 | { | |
2839 | rtx addr; | |
2840 | ||
2841 | /* If the address form is reg+offset with offset not a | |
2842 | multiple of four, reload into reg indirect form here | |
2843 | rather than waiting for reload. This way we get one | |
2844 | reload, not one per load and/or store. */ | |
2845 | addr = XEXP (orig_dest, 0); | |
2846 | if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM) | |
2e42a52f | 2847 | && CONST_INT_P (XEXP (addr, 1)) |
8845cb37 AS |
2848 | && (INTVAL (XEXP (addr, 1)) & 3) != 0) |
2849 | { | |
2850 | addr = copy_addr_to_reg (addr); | |
2851 | orig_dest = replace_equiv_address (orig_dest, addr); | |
2852 | } | |
2853 | addr = XEXP (orig_src, 0); | |
2854 | if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM) | |
2e42a52f | 2855 | && CONST_INT_P (XEXP (addr, 1)) |
8845cb37 AS |
2856 | && (INTVAL (XEXP (addr, 1)) & 3) != 0) |
2857 | { | |
2858 | addr = copy_addr_to_reg (addr); | |
2859 | orig_src = replace_equiv_address (orig_src, addr); | |
2860 | } | |
2861 | } | |
2862 | } | |
8845cb37 AS |
2863 | else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT)) |
2864 | { /* move 4 bytes */ | |
2865 | move_bytes = 4; | |
2866 | mode = SImode; | |
2867 | gen_func.mov = gen_movsi; | |
2868 | } | |
2869 | else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT)) | |
2870 | { /* move 2 bytes */ | |
2871 | move_bytes = 2; | |
2872 | mode = HImode; | |
2873 | gen_func.mov = gen_movhi; | |
2874 | } | |
8845cb37 AS |
2875 | else /* move 1 byte at a time */ |
2876 | { | |
2877 | move_bytes = 1; | |
2878 | mode = QImode; | |
2879 | gen_func.mov = gen_movqi; | |
2880 | } | |
2881 | ||
afd97163 AS |
2882 | /* If we can't succeed in doing the move in one pass, we can't |
2883 | do it in the might_overlap case. Bail out and return | |
2884 | failure. We test num_reg + 1 >= MAX_MOVE_REG here to check | |
2885 | the same condition as the test of num_reg >= MAX_MOVE_REG | |
2886 | that is done below after the increment of num_reg. */ | |
2887 | if (might_overlap && num_reg + 1 >= MAX_MOVE_REG | |
2888 | && bytes > move_bytes) | |
2889 | return 0; | |
2890 | ||
2891 | /* Mode is always set to something other than BLKmode by one of the | |
c8241327 AS |
2892 | cases of the if statement above. */ |
2893 | gcc_assert (mode != BLKmode); | |
2894 | ||
8845cb37 AS |
2895 | src = adjust_address (orig_src, mode, offset); |
2896 | dest = adjust_address (orig_dest, mode, offset); | |
2897 | ||
c8241327 | 2898 | rtx tmp_reg = gen_reg_rtx (mode); |
8845cb37 | 2899 | |
afd97163 AS |
2900 | if (move_with_length) |
2901 | { | |
2902 | loads[num_reg] = (*gen_func.movlen) (tmp_reg, src, move_bytes); | |
2903 | stores[num_reg++] = (*gen_func.movlen) (dest, tmp_reg, move_bytes); | |
2904 | } | |
2905 | else | |
2906 | { | |
2907 | loads[num_reg] = (*gen_func.mov) (tmp_reg, src); | |
2908 | stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg); | |
2909 | } | |
8845cb37 | 2910 | |
c8241327 AS |
2911 | /* Emit loads and stores saved up. */ |
2912 | if (num_reg >= MAX_MOVE_REG || bytes == move_bytes) | |
8845cb37 AS |
2913 | { |
2914 | int i; | |
c8241327 AS |
2915 | for (i = 0; i < num_reg; i++) |
2916 | emit_insn (loads[i]); | |
8845cb37 AS |
2917 | for (i = 0; i < num_reg; i++) |
2918 | emit_insn (stores[i]); | |
2919 | num_reg = 0; | |
2920 | } | |
c8241327 | 2921 | |
8845cb37 AS |
2922 | } |
2923 | ||
2924 | return 1; | |
2925 | } |
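As a usage illustration (not part of this file): when the length is a compile-time constant below the -mblock-move-inline-limit= threshold in effect, a memcpy like the one below is expanded inline by this function instead of becoming a library call, with the chunk sizes chosen by the if-ladder above. The function name and the flag choices here are only an example; which chunking is used depends on -mcpu and the -mblock-ops-* options in effect.

/* Compile with, e.g., -O2 -mcpu=power10 on powerpc64le (raise
   -mblock-move-inline-limit= if 37 is above the limit in effect).
   The constant-size copy is expanded inline by expand_block_move.  */
#include <string.h>

void
copy37 (char *dst, const char *src)
{
  memcpy (dst, src, 37);
}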