/* Subroutines used to expand string and block move, clear,
   compare and other operations for PowerPC.
   Copyright (C) 1991-2023 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ira.h"
#include "print-tree.h"
#include "varasm.h"
#include "explow.h"
#include "expr.h"
#include "output.h"
#include "target.h"
#include "profile-count.h"
#include "predict.h"

/* Expand a block clear operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the length
   operands[3] is the alignment */

int
expand_block_clear (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx bytes_rtx = operands[1];
  rtx align_rtx = operands[3];
  bool constp = CONST_INT_P (bytes_rtx);
  HOST_WIDE_INT align;
  HOST_WIDE_INT bytes;
  int offset;
  int clear_bytes;
  int clear_step;

  /* If this is not a fixed size clear, just call memset.  */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment.  */
  gcc_assert (CONST_INT_P (align_rtx));
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to clear?  */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  /* Use the builtin memset after a point, to avoid huge code bloat.
     When optimize_size, avoid any significant code bloat; calling
     memset is about 4 instructions, so allow for one instruction to
     load zero and three to do clearing.  */
  if (TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
    clear_step = 16;
  else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
    clear_step = 8;
  else
    clear_step = 4;

  if (optimize_size && bytes > 3 * clear_step)
    return 0;
  if (! optimize_size && bytes > 8 * clear_step)
    return 0;
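
  /* Unaligned VSX stores are only used once the block is at least 32
     bytes, i.e. big enough for two full vector stores.  */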
  bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);

  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
    {
      machine_mode mode = BLKmode;
      rtx dest;

      if (TARGET_ALTIVEC
	  && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok)))
	{
	  clear_bytes = 16;
	  mode = V4SImode;
	}
      else if (bytes >= 8 && TARGET_POWERPC64
	       && (align >= 64 || !STRICT_ALIGNMENT))
	{
	  clear_bytes = 8;
	  mode = DImode;
	  if (offset == 0 && align < 64)
	    {
	      rtx addr;

	      /* If the address form is reg+offset with offset not a
		 multiple of four, reload into reg indirect form here
		 rather than waiting for reload.  This way we get one
		 reload, not one per store.  */
	      addr = XEXP (orig_dest, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && CONST_INT_P (XEXP (addr, 1))
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_dest = replace_equiv_address (orig_dest, addr);
		}
	    }
	}
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
	{			/* move 4 bytes */
	  clear_bytes = 4;
	  mode = SImode;
	}
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
	{			/* move 2 bytes */
	  clear_bytes = 2;
	  mode = HImode;
	}
      else /* move 1 byte at a time */
	{
	  clear_bytes = 1;
	  mode = QImode;
	}

      dest = adjust_address (orig_dest, mode, offset);

      emit_move_insn (dest, CONST0_RTX (mode));
    }

  return 1;
}

/* Figure out the correct instructions to generate to load data for
   block compare.  MODE is used for the read from memory, and
   data is zero extended if REG is wider than MODE.  If LE code
   is being generated, bswap loads are used.

   REG is the destination register to move the data into.
   MEM is the memory block being read.
   MODE is the mode of memory to use for the read.  */
static void
do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
{
  switch (GET_MODE (reg))
    {
    case E_V16QImode:
      switch (mode)
	{
	case E_V16QImode:
	  if (!BYTES_BIG_ENDIAN)
	    {
	      if (TARGET_P9_VECTOR)
		emit_insn (gen_vsx_ld_elemrev_v16qi_internal (reg, mem));
	      else
		{
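		  /* Pre-P9 LE has no lxvb16x, so do an element-reversing
		     V2DImode load through a subreg of REG; the P8 cleanup
		     code compensates for the resulting byte order.  */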
		  rtx reg_v2di = simplify_gen_subreg (V2DImode, reg,
						      V16QImode, 0);
		  gcc_assert (MEM_P (mem));
		  rtx addr = XEXP (mem, 0);
		  rtx mem_v2di = gen_rtx_MEM (V2DImode, addr);
		  MEM_COPY_ATTRIBUTES (mem_v2di, mem);
		  set_mem_size (mem, GET_MODE_SIZE (V2DImode));
		  emit_insn (gen_vsx_ld_elemrev_v2di (reg_v2di, mem_v2di));
		}
	    }
	  else
	    emit_insn (gen_vsx_movv2di_64bit (reg, mem));
	  break;
	default:
	  gcc_unreachable ();
	}
      break;
    case E_DImode:
      switch (mode)
	{
	case E_QImode:
	  emit_insn (gen_zero_extendqidi2 (reg, mem));
	  break;
	case E_HImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (HImode);
		emit_insn (gen_bswaphi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendhidi2 (reg, src));
	    break;
	  }
	case E_SImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (SImode);
		emit_insn (gen_bswapsi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendsidi2 (reg, src));
	  }
	  break;
	case E_DImode:
	  if (!BYTES_BIG_ENDIAN)
	    emit_insn (gen_bswapdi2 (reg, mem));
	  else
	    emit_insn (gen_movdi (reg, mem));
	  break;
	default:
	  gcc_unreachable ();
	}
      break;

    case E_SImode:
      switch (mode)
	{
	case E_QImode:
	  emit_insn (gen_zero_extendqisi2 (reg, mem));
	  break;
	case E_HImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (HImode);
		emit_insn (gen_bswaphi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendhisi2 (reg, src));
	    break;
	  }
	case E_SImode:
	  if (!BYTES_BIG_ENDIAN)
	    emit_insn (gen_bswapsi2 (reg, mem));
	  else
	    emit_insn (gen_movsi (reg, mem));
	  break;
	case E_DImode:
	  /* DImode is larger than the destination reg so is not expected.  */
	  gcc_unreachable ();
	  break;
	default:
	  gcc_unreachable ();
	}
      break;

    case E_QImode:
      gcc_assert (mode == E_QImode);
      emit_move_insn (reg, mem);
      break;

    default:
      gcc_unreachable ();
      break;
    }
}

/* Select the mode to be used for reading the next chunk of bytes
   in the compare.

   OFFSET is the current read offset from the beginning of the block.
   BYTES is the number of bytes remaining to be read.
   ALIGN is the minimum alignment, in bytes, of the memory blocks being
   compared.  */
static machine_mode
select_block_compare_mode (unsigned HOST_WIDE_INT offset,
			   unsigned HOST_WIDE_INT bytes,
			   unsigned HOST_WIDE_INT align)
{
  /* First see if we can do a whole load unit
     as that will be more efficient than a larger load + shift.  */

  /* If big, use biggest chunk.
     If exactly chunk size, use that size.
     If remainder can be done in one piece with shifting, do that.
     Do largest chunk possible without violating alignment rules.  */

  /* The most we can read without potential page crossing: since both
     blocks are at least ALIGN-aligned, reading up to the next multiple
     of ALIGN cannot cross a page boundary.  */
  unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
		       && word_mode == DImode);

  if (word_mode_ok && bytes >= UNITS_PER_WORD)
    return word_mode;
  else if (bytes == GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes == GET_MODE_SIZE (HImode))
    return HImode;
  else if (bytes == GET_MODE_SIZE (QImode))
    return QImode;
  else if (bytes < GET_MODE_SIZE (SImode)
	   && !targetm.slow_unaligned_access (SImode, align * BITS_PER_UNIT)
	   && offset >= GET_MODE_SIZE (SImode) - bytes)
    /* This matches the case where we have SImode and 3 bytes
       and offset >= 1 and permits us to move back one and overlap
       with the previous read, thus avoiding having to shift
       unwanted bytes off of the input.  */
    return SImode;
  else if (word_mode_ok && bytes < UNITS_PER_WORD
	   && !targetm.slow_unaligned_access (word_mode, align * BITS_PER_UNIT)
	   && offset >= UNITS_PER_WORD - bytes)
    /* Similarly, if we can use DImode it will get matched here and
       can do an overlapping read that ends at the end of the block.  */
    return word_mode;
  else if (word_mode_ok && maxread >= UNITS_PER_WORD)
    /* It is safe to do all remaining in one load of largest size,
       possibly with a shift to get rid of unwanted bytes.  */
    return word_mode;
  else if (maxread >= GET_MODE_SIZE (SImode))
    /* It is safe to do all remaining in one SImode load,
       possibly with a shift to get rid of unwanted bytes.  */
    return SImode;
  else if (bytes > GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes > GET_MODE_SIZE (HImode))
    return HImode;

  /* Final fallback is to do one byte.  */
  return QImode;
}

/* Compute the alignment of pointer+OFFSET where the original alignment
   of pointer was BASE_ALIGN.  */
static unsigned HOST_WIDE_INT
compute_current_alignment (unsigned HOST_WIDE_INT base_align,
			   unsigned HOST_WIDE_INT offset)
{
  if (offset == 0)
    return base_align;
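  /* OFFSET & -OFFSET isolates the lowest set bit of OFFSET, which is
     the largest power of two that divides it, so pointer+OFFSET cannot
     be assumed to be aligned more strictly than that.  */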
  return MIN (base_align, offset & -offset);
}

/* Prepare address and then do a load.

   MODE is the mode to use for the load.
   DEST is the destination register for the data.
   ADDR is the address to be loaded.
   ORIG_ADDR is the original address expression.  */
static void
do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr,
			       rtx orig_addr)
{
  rtx mem = gen_rtx_MEM (mode, addr);
  MEM_COPY_ATTRIBUTES (mem, orig_addr);
  set_mem_size (mem, GET_MODE_SIZE (mode));
  do_load_for_compare (dest, mem, mode);
  return;
}

/* Do a branch for an if/else decision.

   CMPMODE is the mode to use for the comparison.
   COMPARISON is the rtx code for the compare needed.
   A is the first thing to be compared.
   B is the second thing to be compared.
   CR is the condition code reg input, or NULL_RTX.
   TRUE_LABEL is the label to branch to if the condition is true.
   BR_PROB is the estimated branch probability for the branch.

   The compare is emitted into CR.
   If CR is null_rtx, then a new register of CMPMODE is generated.
   If A and B are both null_rtx, then CR must not be null, and the
   compare is not generated so you can use this with a dot form insn.  */

static void
do_ifelse (machine_mode cmpmode, rtx_code comparison,
	   rtx a, rtx b, rtx cr, rtx true_label, profile_probability br_prob)
{
  gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
	      || (a != NULL_RTX && b != NULL_RTX));

  if (cr != NULL_RTX)
    gcc_assert (GET_MODE (cr) == cmpmode);
  else
    cr = gen_reg_rtx (cmpmode);

  rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label);

  if (a != NULL_RTX)
    emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b));

  rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);

  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
  rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
  add_reg_br_prob_note (j, br_prob);
  JUMP_LABEL (j) = true_label;
  LABEL_NUSES (true_label) += 1;
}

/* Emit an isel of the proper mode for DEST.

   DEST is the isel destination register.
   CMP is the comparison rtx for the isel condition.
   SRC_T is the isel source if the condition is true.
   SRC_F is the isel source if the condition is false.
   CR is the condition register for the isel.  */
static void
do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_isel_cc_di (dest, cmp, src_t, src_f, cr));
  else
    emit_insn (gen_isel_cc_si (dest, cmp, src_t, src_f, cr));
}

/* Emit a subtract of the proper mode for DEST.

   DEST is the destination register for the subtract.
   SRC1 is the first subtract input.
   SRC2 is the second subtract input.

   Computes DEST = SRC1-SRC2.  */
static void
do_sub3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_subdi3 (dest, src1, src2));
  else
    emit_insn (gen_subsi3 (dest, src1, src2));
}

/* Emit an add of the proper mode for DEST.

   DEST is the destination register for the add.
   SRC1 is the first add input.
   SRC2 is the second add input.

   Computes DEST = SRC1+SRC2.  */
static void
do_add3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_adddi3 (dest, src1, src2));
  else
    emit_insn (gen_addsi3 (dest, src1, src2));
}

/* Emit an and of the proper mode for DEST.

   DEST is the destination register for the and.
   SRC1 is the first and input.
   SRC2 is the second and input.

   Computes DEST = SRC1&SRC2.  */
static void
do_and3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_anddi3 (dest, src1, src2));
  else
    emit_insn (gen_andsi3 (dest, src1, src2));
}

/* Emit a cmpb of the proper mode for DEST.

   DEST is the destination register for the cmpb.
   SRC1 is the first input.
   SRC2 is the second input.

   Computes cmpb of SRC1, SRC2.  */
static void
do_cmpb3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_cmpbdi3 (dest, src1, src2));
  else
    emit_insn (gen_cmpbsi3 (dest, src1, src2));
}

/* Emit a rotl of the proper mode for DEST.

   DEST is the destination register for the rotate.
   SRC1 is the rotate input.
   SRC2 is the rotate count.

   Computes DEST = SRC1 rotated left by SRC2.  */
static void
do_rotl3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_rotldi3 (dest, src1, src2));
  else
    emit_insn (gen_rotlsi3 (dest, src1, src2));
}

/* Generate rtl for a load, shift, and compare of less than a full word.

   LOAD_MODE is the machine mode for the loads.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem,
		      rtx dcond, rtx src1_addr, rtx src2_addr,
		      rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  rtx shift_amount = gen_reg_rtx (word_mode);
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2);
  do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem);
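
  /* SHIFT_AMOUNT now holds the count of loaded bytes that lie past the
     remaining length; turn it into a bit count and shift both values
     right so that only the CMP_REM bytes that matter take part in the
     compare.  */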
  if (word_mode == DImode)
    {
      emit_insn (gen_ashldi3 (shift_amount, shift_amount,
			      GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrdi3 (d1, d1,
			      gen_lowpart (SImode, shift_amount)));
      emit_insn (gen_lshrdi3 (d2, d2,
			      gen_lowpart (SImode, shift_amount)));
    }
  else
    {
      emit_insn (gen_ashlsi3 (shift_amount, shift_amount,
			      GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrsi3 (d1, d1, shift_amount));
      emit_insn (gen_lshrsi3 (d2, d2, shift_amount));
    }

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      if (word_mode == DImode)
	emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
	emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}

/* Generate rtl for an overlapping load and compare of less than a
   full load_mode.  This assumes that the previous word is part of the
   block being compared so it's ok to back up part of a word so we can
   compare the last unaligned full word that ends at the end of the block.

   LOAD_MODE is the machine mode for the loads.
   ISCONST tells whether the remaining length is a constant or in a register.
   BYTES_REM is the remaining length if ISCONST is true.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare if !ISCONST.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_overlap_load_compare (machine_mode load_mode, bool isConst,
			 HOST_WIDE_INT bytes_rem, rtx diff,
			 rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr,
			 rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem;
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  rtx addr1, addr2;
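  /* Back the addresses up so that the final load ends exactly at the
     end of the block; for the runtime case the (negative) adjustment
     is CMP_REM - LOAD_MODE_SIZE.  */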
  if (!isConst || addr_adj)
    {
      rtx adj_reg = gen_reg_rtx (word_mode);
      if (isConst)
	emit_move_insn (adj_reg, GEN_INT (-addr_adj));
      else
	{
	  rtx reg_lms = gen_reg_rtx (word_mode);
	  emit_move_insn (reg_lms, GEN_INT (load_mode_size));
	  do_sub3 (adj_reg, cmp_rem, reg_lms);
	}

      addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg);
      addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg);
    }
  else
    {
      addr1 = src1_addr;
      addr2 = src2_addr;
    }

  do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      if (word_mode == DImode)
	emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
	emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}

/* Generate the sequence of compares for strcmp/strncmp using vec/vsx
   instructions.

   BYTES_TO_COMPARE is the number of bytes to be compared.
   ORIG_SRC1 is the unmodified rtx for the first string.
   ORIG_SRC2 is the unmodified rtx for the second string.
   S1ADDR is the register to use for the base address of the first string.
   S2ADDR is the register to use for the base address of the second string.
   OFF_REG is the register to use for the string offset for loads.
   S1DATA is the register for loading the first string.
   S2DATA is the register for loading the second string.
   VEC_RESULT is the rtx for the vector result indicating the byte difference.
   EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
   to strcmp/strncmp if we have equality at the end of the inline comparison.
   P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
   to clean up and generate the final comparison result.
   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
   set the final result.
   CHECKZERO indicates whether the sequence should check for zero bytes
   for use doing strncmp, or not (for use doing memcmp).  */
static void
expand_cmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
			 rtx orig_src1, rtx orig_src2,
			 rtx s1addr, rtx s2addr, rtx off_reg,
			 rtx s1data, rtx s2data, rtx vec_result,
			 bool equality_compare_rest, rtx *p_cleanup_label,
			 rtx final_move_label, bool checkzero)
{
  machine_mode load_mode;
  unsigned int load_mode_size;
  unsigned HOST_WIDE_INT cmp_bytes = 0;
  unsigned HOST_WIDE_INT offset = 0;
  rtx zero_reg = NULL;

  gcc_assert (p_cleanup_label != NULL);
  rtx cleanup_label = *p_cleanup_label;

  emit_move_insn (s1addr, force_reg (Pmode, XEXP (orig_src1, 0)));
  emit_move_insn (s2addr, force_reg (Pmode, XEXP (orig_src2, 0)));

  if (checkzero && !TARGET_P9_VECTOR)
    {
      zero_reg = gen_reg_rtx (V16QImode);
      emit_move_insn (zero_reg, CONST0_RTX (V16QImode));
    }

  while (bytes_to_compare > 0)
    {
      /* VEC/VSX compare sequence for P8:
	 check each 16B with:
	 lxvd2x 32,28,8
	 lxvd2x 33,29,8
	 vcmpequb 2,0,1	 # compare strings
	 vcmpequb 4,0,3	 # compare w/ 0
	 xxlorc 37,36,34  # first FF byte is either mismatch or end of string
	 vcmpequb. 7,5,3  # reg 7 contains 0
	 bnl 6,.Lmismatch

	 For the P8 LE case, we use lxvd2x and compare full 16 bytes
	 but then use vgbbd and a shift to get two bytes with the
	 information we need in the correct order.

	 VEC/VSX compare sequence if TARGET_P9_VECTOR:
	 lxvb16x/lxvb16x	 # load 16B of each string
	 vcmpnezb.	 # produces difference location or zero byte location
	 bne 6,.Lmismatch

	 Use the overlapping compare trick for the last block if it is
	 less than 16 bytes.
      */

      load_mode = V16QImode;
      load_mode_size = GET_MODE_SIZE (load_mode);

      if (bytes_to_compare >= load_mode_size)
	cmp_bytes = load_mode_size;
      else
	{
	  /* Move this load back so it doesn't go past the end.  P8/P9
	     can do this efficiently.  This is never called with less
	     than 16 bytes so we should always be able to do this.  */
	  unsigned int extra_bytes = load_mode_size - bytes_to_compare;
	  cmp_bytes = bytes_to_compare;
	  gcc_assert (offset > extra_bytes);
	  offset -= extra_bytes;
	  cmp_bytes = load_mode_size;
	  bytes_to_compare = cmp_bytes;
	}

      /* The offset currently used is always kept in off_reg so that the
	 cleanup code on P8 can use it to extract the differing byte.  */
      emit_move_insn (off_reg, GEN_INT (offset));

      rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg);
      do_load_for_compare_from_addr (load_mode, s1data, addr1, orig_src1);
      rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg);
      do_load_for_compare_from_addr (load_mode, s2data, addr2, orig_src2);

      /* Cases to handle.  A and B are chunks of the two strings.
	 1: Not end of comparison:
	 A != B: branch to cleanup code to compute result.
	 A == B: next block
	 2: End of the inline comparison:
	 A != B: branch to cleanup code to compute result.
	 A == B: call strcmp/strncmp
	 3: compared requested N bytes:
	 A == B: branch to result 0.
	 A != B: cleanup code to compute result.  */

      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;

      if (checkzero)
	{
	  if (TARGET_P9_VECTOR)
	    emit_insn (gen_vcmpnezb_p (vec_result, s1data, s2data));
	  else
	    {
	      /* Emit instructions to do comparison and zero check.  */
	      rtx cmp_res = gen_reg_rtx (load_mode);
	      rtx cmp_zero = gen_reg_rtx (load_mode);
	      rtx cmp_combined = gen_reg_rtx (load_mode);
	      emit_insn (gen_altivec_eqv16qi (cmp_res, s1data, s2data));
	      emit_insn (gen_altivec_eqv16qi (cmp_zero, s1data, zero_reg));
	      emit_insn (gen_orcv16qi3 (vec_result, cmp_zero, cmp_res));
	      emit_insn (gen_altivec_vcmpequb_p (cmp_combined, vec_result,
						 zero_reg));
	    }
	}
      else
	emit_insn (gen_altivec_vcmpequb_p (vec_result, s1data, s2data));

      bool branch_to_cleanup = (remain > 0 || equality_compare_rest);
      rtx cr6 = gen_rtx_REG (CCmode, CR6_REGNO);
      rtx dst_label;
      rtx cmp_rtx;
      if (branch_to_cleanup)
	{
	  /* Branch to cleanup code, otherwise fall through to do more
	     compares.  P8 and P9 use different CR bits because on P8
	     we are looking at the result of a comparison vs a
	     register of zeroes so the all-true condition means no
	     difference or zero was found.  On P9, vcmpnezb sets a byte
	     to 0xff if there is a mismatch or zero, so the all-false
	     condition indicates we found no difference or zero.  */
	  if (!cleanup_label)
	    cleanup_label = gen_label_rtx ();
	  dst_label = cleanup_label;
	  if (TARGET_P9_VECTOR && checkzero)
	    cmp_rtx = gen_rtx_NE (VOIDmode, cr6, const0_rtx);
	  else
	    cmp_rtx = gen_rtx_GE (VOIDmode, cr6, const0_rtx);
	}
      else
	{
	  /* Branch to final return or fall through to cleanup,
	     result is already set to 0.  */
	  dst_label = final_move_label;
	  if (TARGET_P9_VECTOR && checkzero)
	    cmp_rtx = gen_rtx_EQ (VOIDmode, cr6, const0_rtx);
	  else
	    cmp_rtx = gen_rtx_LT (VOIDmode, cr6, const0_rtx);
	}

      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
					 lab_ref, pc_rtx);
      rtx_insn *j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
      add_reg_br_prob_note (j2, profile_probability::likely ());
      JUMP_LABEL (j2) = dst_label;
      LABEL_NUSES (dst_label) += 1;

      offset += cmp_bytes;
      bytes_to_compare -= cmp_bytes;
    }
  *p_cleanup_label = cleanup_label;
  return;
}

/* Generate the final sequence that identifies the differing
   byte and generates the final result, taking into account
   zero bytes:

   P8:
   vgbbd 0,0
   vsldoi 0,0,0,9
   mfvsrd 9,32
   addi 10,9,-1	   # count trailing zero bits
   andc 9,10,9
   popcntd 9,9
   lbzx 10,28,9	   # use that offset to load differing byte
   lbzx 3,29,9
   subf 3,3,10	   # subtract for final result

   P9:
   vclzlsbb	   # counts trailing bytes with lsb=0
   vextublx	   # extract differing byte

   STR1 is the reg rtx for data from string 1.
   STR2 is the reg rtx for data from string 2.
   RESULT is the reg rtx for the comparison result.
   S1ADDR is the register to use for the base address of the first string.
   S2ADDR is the register to use for the base address of the second string.
   ORIG_SRC1 is the unmodified rtx for the first string.
   ORIG_SRC2 is the unmodified rtx for the second string.
   OFF_REG is the register to use for the string offset for loads.
   VEC_RESULT is the rtx for the vector result indicating the byte difference.  */

static void
emit_final_compare_vec (rtx str1, rtx str2, rtx result,
			rtx s1addr, rtx s2addr,
			rtx orig_src1, rtx orig_src2,
			rtx off_reg, rtx vec_result)
{
  if (TARGET_P9_VECTOR)
    {
      rtx diffix = gen_reg_rtx (SImode);
      rtx chr1 = gen_reg_rtx (SImode);
      rtx chr2 = gen_reg_rtx (SImode);
      rtx chr1_di = simplify_gen_subreg (DImode, chr1, SImode, 0);
      rtx chr2_di = simplify_gen_subreg (DImode, chr2, SImode, 0);
      emit_insn (gen_vclzlsbb_v16qi (diffix, vec_result));
      emit_insn (gen_vextublx (chr1, diffix, str1));
      emit_insn (gen_vextublx (chr2, diffix, str2));
      do_sub3 (result, chr1_di, chr2_di);
    }
  else
    {
      gcc_assert (TARGET_P8_VECTOR);
      rtx diffix = gen_reg_rtx (DImode);
      rtx result_gbbd = gen_reg_rtx (V16QImode);
      /* Since each byte of the input is either 00 or FF, the bytes in
	 dw0 and dw1 after vgbbd are all identical to each other.  */
      emit_insn (gen_p8v_vgbbd (result_gbbd, vec_result));
      /* For LE, we shift by 9 and get BA in the low two bytes then CTZ.
	 For BE, we shift by 7 and get AB in the high two bytes then CLZ.  */
      rtx result_shifted = gen_reg_rtx (V16QImode);
      int shift_amt = (BYTES_BIG_ENDIAN) ? 7 : 9;
      emit_insn (gen_altivec_vsldoi_v16qi (result_shifted, result_gbbd,
					   result_gbbd, GEN_INT (shift_amt)));

      rtx diffix_df = simplify_gen_subreg (DFmode, diffix, DImode, 0);
      emit_insn (gen_p8_mfvsrd_3_v16qi (diffix_df, result_shifted));
      rtx count = gen_reg_rtx (DImode);

      if (BYTES_BIG_ENDIAN)
	emit_insn (gen_clzdi2 (count, diffix));
      else
	emit_insn (gen_ctzdi2 (count, diffix));

      /* P8 doesn't have a good solution for extracting one byte from
	 a vsx reg like vextublx on P9 so we just compute the offset
	 of the differing byte and load it from each string.  */
      do_add3 (off_reg, off_reg, count);

      rtx chr1 = gen_reg_rtx (QImode);
      rtx chr2 = gen_reg_rtx (QImode);
      rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg);
      do_load_for_compare_from_addr (QImode, chr1, addr1, orig_src1);
      rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg);
      do_load_for_compare_from_addr (QImode, chr2, addr2, orig_src2);
      machine_mode rmode = GET_MODE (result);
      rtx chr1_rm = simplify_gen_subreg (rmode, chr1, QImode, 0);
      rtx chr2_rm = simplify_gen_subreg (rmode, chr2, QImode, 0);
      do_sub3 (result, chr1_rm, chr2_rm);
    }

  return;
}

/* Expand a block compare operation using loop code, and return true
   if successful.  Return false if we should let the compiler generate
   normal code, probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_compare_loop (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  /* Allow non-const length.  */
  int bytes_is_const = CONST_INT_P (bytes_rtx);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
  HOST_WIDE_INT minalign = MIN (align1, align2);

  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to move?  */
  HOST_WIDE_INT bytes = 0;
  if (bytes_is_const)
    bytes = INTVAL (bytes_rtx);

  if (bytes_is_const && bytes == 0)
    return true;

  /* Limit the amount we compare, if known statically.  */
  HOST_WIDE_INT max_bytes;
  switch (rs6000_tune)
    {
    case PROCESSOR_POWER7:
      if (!bytes_is_const)
	if (minalign < 8)
	  max_bytes = 0;
	else
	  max_bytes = 128;
      else
	if (minalign < 8)
	  max_bytes = 32;
	else
	  max_bytes = 128;
      break;
    case PROCESSOR_POWER8:
      if (!bytes_is_const)
	max_bytes = 0;
      else
	if (minalign < 8)
	  max_bytes = 128;
	else
	  max_bytes = 64;
      break;
    case PROCESSOR_POWER9:
    case PROCESSOR_POWER10:
      if (bytes_is_const)
	max_bytes = 191;
      else
	max_bytes = 0;
      break;
    default:
      max_bytes = 128;
    }

  /* Allow the option to override the default.  */
  if (rs6000_block_compare_inline_loop_limit >= 0)
    max_bytes = (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_loop_limit;

  if (max_bytes == 0)
    return false;

  rtx cmp_rem = gen_reg_rtx (word_mode);  /* Remainder for library call.  */
  rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop.  */
  HOST_WIDE_INT niter;
  rtx iter = gen_reg_rtx (word_mode);
  rtx iv1 = gen_reg_rtx (word_mode);
  rtx iv2 = gen_reg_rtx (word_mode);
  rtx d1_1 = gen_reg_rtx (word_mode);  /* Data loaded from src1+iv1.  */
  rtx d1_2 = gen_reg_rtx (word_mode);  /* Data loaded from src1+iv2.  */
  rtx d2_1 = gen_reg_rtx (word_mode);  /* Data loaded from src2+iv1.  */
  rtx d2_2 = gen_reg_rtx (word_mode);  /* Data loaded from src2+iv2.  */

  /* Strip unneeded subreg from length if there is one.  */
  if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx))
    bytes_rtx = SUBREG_REG (bytes_rtx);
  /* Extend bytes_rtx to word_mode if needed.  But, we expect only to
     maybe have to deal with the case where bytes_rtx is SImode and
     word_mode is DImode.  */
  if (!bytes_is_const)
    {
      if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
	/* Do not expect length longer than word_mode.  */
	return false;
      else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode))
	{
	  bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
	  bytes_rtx = force_reg (word_mode,
				 gen_rtx_fmt_e (ZERO_EXTEND, word_mode,
						bytes_rtx));
	}
      else
	/* Make sure it's in a register before we get started.  */
	bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
    }

  machine_mode load_mode = word_mode;
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);

  /* Number of bytes per iteration of the unrolled loop.  */
  HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
  /* Max iters and bytes compared in the loop.  */
  HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes;
  HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes;
  int l2lb = floor_log2 (loop_bytes);
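
  /* Punt back to normal code (probably a memcmp call) if a constant
     length does not allow at least one full load_mode chunk within
     max_bytes.  */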
  if (bytes_is_const && (max_bytes < load_mode_size
			 || !IN_RANGE (bytes, load_mode_size, max_bytes)))
    return false;

  bool no_remainder_code = false;
  rtx final_label = gen_label_rtx ();
  rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
  rtx diff_label = gen_label_rtx ();
  rtx library_call_label = NULL;
  rtx cleanup_label = gen_label_rtx ();

  rtx cr;

  rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
  rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));

  /* Difference found is stored here before jump to diff_label.  */
  rtx diff = gen_reg_rtx (word_mode);
  rtx_insn *j;

  /* Example of generated code for 35 bytes aligned 1 byte.

     mtctr 8
     li 6,0
     li 5,8
     .L13:
     ldbrx 7,3,6
     ldbrx 9,10,6
     ldbrx 0,3,5
     ldbrx 4,10,5
     addi 6,6,16
     addi 5,5,16
     subfc. 9,9,7
     bne 0,.L10
     subfc. 9,4,0
     bdnzt 2,.L13
     bne 0,.L10
     add 3,3,6
     add 10,10,6
     addi 9,3,-5
     ldbrx 7,0,9
     addi 9,10,-5
     ldbrx 9,0,9
     subfc 9,9,7
     .p2align 4,,15
     .L10:
     popcntd 9,9
     subfe 10,10,10
     or 9,9,10

     Compiled with -fno-reorder-blocks for clarity.  */

  /* Structure of what we're going to do:
     Two separate lengths: what we will compare before bailing to library
	call (max_bytes), and the total length to be checked.
     if length <= 16, branch to linear cleanup code starting with
	remainder length check (length not known at compile time)
     set up 2 iv's and load count reg, compute remainder length
     unrollx2 compare loop
     if loop exit due to a difference, branch to difference handling code
     if remainder length < 8, branch to final cleanup compare
     load and compare 8B
     final cleanup comparison (depends on alignment and length)
	load 8B, shift off bytes past length, compare
	load 8B ending at last byte and compare
	load/compare 1 byte at a time (short block abutting 4k boundary)
     difference handling, 64->32 conversion
     final result
     branch around memcmp call
     memcmp library call
  */

  /* If bytes is not const, compare length and branch directly to the
     cleanup code that can handle 0-16 bytes when length is < 16.
     Stash away bytes-max_bytes for the library call.  */
  if (bytes_is_const)
    {
      /* These need to be set for some of the places we may jump to.  */
      if (bytes > max_bytes)
	{
	  no_remainder_code = true;
	  niter = max_loop_iter;
	  library_call_label = gen_label_rtx ();
	}
      else
	{
	  niter = bytes / loop_bytes;
	}
      emit_move_insn (iter, GEN_INT (niter));
      emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes));
      emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes));
    }
  else
    {
      library_call_label = gen_label_rtx ();

      /* If we go to the cleanup code, it expects length to be in cmp_rem.  */
      emit_move_insn (cmp_rem, bytes_rtx);

      /* Check for > max_bytes bytes.  We want to bail out as quickly as
	 possible if we have to go over to memcmp.  */
      do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
		 NULL_RTX, library_call_label, profile_probability::even ());

      /* Check for < loop_bytes bytes.  */
      do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes),
		 NULL_RTX, cleanup_label, profile_probability::even ());

      /* Loop compare bytes and iterations if bytes>max_bytes.  */
      rtx mb_reg = gen_reg_rtx (word_mode);
      emit_move_insn (mb_reg, GEN_INT (max_loop_bytes));
      rtx mi_reg = gen_reg_rtx (word_mode);
      emit_move_insn (mi_reg, GEN_INT (max_loop_iter));

      /* Compute number of loop iterations if bytes <= max_bytes.  */
      if (word_mode == DImode)
	emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb)));
      else
	emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb)));

      /* Compute bytes to compare in loop if bytes <= max_bytes.  */
      rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb);
      if (word_mode == DImode)
	{
	  emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask));
	}
      else
	{
	  emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask));
	}

      /* Check for bytes <= max_bytes.  */
      if (TARGET_ISEL)
	{
	  /* P9 has fast isel so we use one compare and two isel.  */
	  cr = gen_reg_rtx (CCmode);
	  rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx,
					     GEN_INT (max_bytes));
	  emit_move_insn (cr, compare_rtx);
	  rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx);
	  do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr);
	  do_isel (iter, cmp_rtx, iter, mi_reg, cr);
	}
      else
	{
	  rtx lab_after = gen_label_rtx ();
	  do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes),
		     NULL_RTX, lab_after, profile_probability::even ());
	  emit_move_insn (loop_cmp, mb_reg);
	  emit_move_insn (iter, mi_reg);
	  emit_label (lab_after);
	}

      /* Now compute remainder bytes which isn't used until after the loop.  */
      do_sub3 (cmp_rem, bytes_rtx, loop_cmp);
    }

  rtx dcond = NULL_RTX; /* Used for when we jump to diff_label.  */
  /* For P9 we need to have just one of these as multiple places define
     it and it gets used by the setb at the end.  */
  if (TARGET_P9_MISC)
    dcond = gen_reg_rtx (CCUNSmode);

  if (!bytes_is_const || bytes >= loop_bytes)
    {
      /* It should not be possible to come here if remaining bytes is
	 < 16 in the runtime case either.  Compute number of loop
	 iterations.  We compare 2*word_mode per iteration so 16B for
	 64-bit code and 8B for 32-bit.  Set up two induction
	 variables and load count register.  */

      /* HACK ALERT: create hard reg for CTR here.  If we just use a
	 pseudo, cse will get rid of it and then the allocator will
	 see it used in the lshr above and won't give us ctr.  */
      rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
      emit_move_insn (ctr, iter);
      emit_move_insn (diff, GEN_INT (0));
      emit_move_insn (iv1, GEN_INT (0));
      emit_move_insn (iv2, GEN_INT (load_mode_size));

      /* Inner loop to compare 2*word_mode.  */
      rtx loop_top_label = gen_label_rtx ();
      emit_label (loop_top_label);

      rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1);
      rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1);

      do_load_for_compare_from_addr (load_mode, d1_1,
				     src1_ix1, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_1,
				     src2_ix1, orig_src2);
      do_add3 (iv1, iv1, GEN_INT (loop_bytes));

      rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2);
      rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2);

      do_load_for_compare_from_addr (load_mode, d1_2,
				     src1_ix2, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_2,
				     src2_ix2, orig_src2);
      do_add3 (iv2, iv2, GEN_INT (loop_bytes));

      if (TARGET_P9_MISC)
	{
	  /* Generate a compare, and convert with a setb later.  */
	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
	  emit_insn (gen_rtx_SET (dcond, cmp));
	}
      else
	{
	  dcond = gen_reg_rtx (CCmode);
	  if (word_mode == DImode)
	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	  else
	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	}

      do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
		 dcond, diff_label, profile_probability::unlikely ());

      if (TARGET_P9_MISC)
	{
	  /* Generate a compare, and convert with a setb later.  */
	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2);
	  emit_insn (gen_rtx_SET (dcond, cmp));
	}
      else
	{
	  dcond = gen_reg_rtx (CCmode);
	  if (word_mode == DImode)
	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond));
	  else
	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond));
	}
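
      /* Decrement CTR and branch back to the loop top while the second
	 pair of words compared equal; fall through once the count runs
	 out or a difference was recorded.  */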
      rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2);
      if (TARGET_64BIT)
	j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr,
					   eqrtx, dcond));
      else
	j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
					   eqrtx, dcond));
      add_reg_br_prob_note (j, profile_probability::likely ());
      JUMP_LABEL (j) = loop_top_label;
      LABEL_NUSES (loop_top_label) += 1;
    }

  HOST_WIDE_INT bytes_remaining = 0;
  if (bytes_is_const)
    bytes_remaining = (bytes % loop_bytes);

  /* If diff is nonzero, branch to difference handling
     code.  If we exit here with a nonzero diff, it is
     because the second word differed.  */
  if (TARGET_P9_MISC)
    do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond,
	       diff_label, profile_probability::unlikely ());
  else
    do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX,
	       diff_label, profile_probability::unlikely ());

  if (library_call_label != NULL && bytes_is_const && bytes > max_bytes)
    {
      /* If the length is known at compile time, then we will always
	 have a remainder to go to the library call with.  */
      rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, library_call_label);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref));
      JUMP_LABEL (j) = library_call_label;
      LABEL_NUSES (library_call_label) += 1;
      emit_barrier ();
    }

  if (bytes_is_const && bytes_remaining == 0)
    {
      /* No remainder, and if we are here then diff is 0 so just return 0.  */
      if (TARGET_64BIT)
	emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
      else
	emit_move_insn (target, diff);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
      JUMP_LABEL (j) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }
  else if (!no_remainder_code)
    {
      /* Update addresses to point to the next word to examine.  */
      do_add3 (src1_addr, src1_addr, iv1);
      do_add3 (src2_addr, src2_addr, iv1);

      emit_label (cleanup_label);

      if (!bytes_is_const)
	{
	  /* If we're dealing with runtime length, we have to check if
	     it's zero after the loop.  When length is known at compile
	     time the no-remainder condition is dealt with above.  By
	     doing this after cleanup_label, we also deal with the
	     case where length is 0 at the start and we bypass the
	     loop with a branch to cleanup_label.  */
	  emit_move_insn (target, const0_rtx);
	  do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
		     NULL_RTX, final_label, profile_probability::unlikely ());
	}

      rtx final_cleanup = gen_label_rtx ();
      rtx cmp_rem_before = gen_reg_rtx (word_mode);
      /* Compare one more word_mode chunk if needed.  */
      if (!bytes_is_const || bytes_remaining >= load_mode_size)
	{
	  /* If remainder length < word length, branch to final
	     cleanup compare.  */

	  if (!bytes_is_const)
	    {
	      do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size),
			 NULL_RTX, final_cleanup, profile_probability::even ());
	    }

	  /* Load and compare 8B.  */
	  do_load_for_compare_from_addr (load_mode, d1_1,
					 src1_addr, orig_src1);
	  do_load_for_compare_from_addr (load_mode, d2_1,
					 src2_addr, orig_src2);

	  /* Compare the word, see if we need to do the last partial.  */
	  if (TARGET_P9_MISC)
	    {
	      /* Generate a compare, and convert with a setb later.  */
	      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
	      emit_insn (gen_rtx_SET (dcond, cmp));
	    }
	  else
	    {
	      dcond = gen_reg_rtx (CCmode);
	      if (word_mode == DImode)
		emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	      else
		emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	    }

	  do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
		     dcond, diff_label, profile_probability::even ());

	  do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size));
	  do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size));
	  emit_move_insn (cmp_rem_before, cmp_rem);
	  do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size));
	  if (bytes_is_const)
	    bytes_remaining -= load_mode_size;
	  else
	    /* See if remaining length is now zero.  We previously set
	       target to 0 so we can just jump to the end.  */
	    do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX,
		       final_label, profile_probability::unlikely ());
	}

      /* Cases:
	 bytes_is_const
	   We can always shift back to do an overlapping compare
	   of the last chunk because we know length >= 8.

	 !bytes_is_const
	   align>=load_mode_size
	     Read word_mode and mask
	   align<load_mode_size
	     avoid stepping past end

	 Three strategies:
	 * decrement address and do overlapping compare
	 * read word_mode and mask
	 * carefully avoid crossing 4k boundary
      */

      if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7))
	  && align1 >= load_mode_size && align2 >= load_mode_size)
	{
	  /* Alignment is larger than word_mode so we do not need to be
	     concerned with extra page crossings.  But, we do not know
	     that the length is larger than load_mode_size so we might
	     end up comparing against data before the block if we try
	     an overlapping compare.  Also we use this on P7 for fixed length
	     remainder because P7 doesn't like overlapping unaligned.
	     Strategy: load 8B, shift off bytes past length, and compare.  */
	  emit_label (final_cleanup);
	  do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
				src1_addr, src2_addr, orig_src1, orig_src2);
	}
      else if (bytes_remaining && bytes_is_const)
	{
	  /* We do not do loop expand if length < 32 so we know at the
	     end we can do an overlapping compare.
	     Strategy: shift address back and do word_mode load that
	     ends at the end of the block.  */
	  emit_label (final_cleanup);
	  do_overlap_load_compare (load_mode, true, bytes_remaining, diff,
				   cmp_rem, dcond, src1_addr, src2_addr,
				   orig_src1, orig_src2);
	}
      else if (!bytes_is_const)
	{
	  rtx handle4k_label = gen_label_rtx ();
	  rtx nonconst_overlap = gen_label_rtx ();
	  emit_label (nonconst_overlap);

	  /* Here we have to handle the case where we have runtime
	     length which may be too short for overlap compare, and
	     alignment is not at least load_mode_size so we have to
	     tread carefully to avoid stepping across 4k boundaries.  */

	  /* If the length after the loop was larger than word_mode
	     size, we can just do an overlapping compare and we're
	     done.  We fall through to this code from the word_mode
	     compare that precedes this.  */
	  do_overlap_load_compare (load_mode, false, 0, diff,
				   cmp_rem, dcond, src1_addr, src2_addr,
				   orig_src1, orig_src2);

	  rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label);
	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
	  JUMP_LABEL (j) = diff_label;
	  LABEL_NUSES (diff_label) += 1;
	  emit_barrier ();

	  /* If we couldn't do the overlap compare we have to be more
	     careful of the 4k boundary.  Test to see if either
	     address is less than word_mode_size away from a 4k
	     boundary.  If not, then we can do a load/shift/compare
	     and we are done.  We come to this code if length was less
	     than word_mode_size.  */

	  emit_label (final_cleanup);

	  /* We can still avoid the slow case if the length was larger
	     than one loop iteration, in which case go do the overlap
	     load compare path.  */
	  do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes),
		     NULL_RTX, nonconst_overlap, profile_probability::even ());

	  rtx rem4k = gen_reg_rtx (word_mode);
	  rtx dist1 = gen_reg_rtx (word_mode);
	  rtx dist2 = gen_reg_rtx (word_mode);
	  do_sub3 (rem4k, GEN_INT (4096), cmp_rem);
	  if (word_mode == SImode)
	    emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff)));
	  else
	    emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff)));
	  do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX,
		     handle4k_label, profile_probability::very_unlikely ());
	  if (word_mode == SImode)
	    emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff)));
	  else
	    emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff)));
	  do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX,
		     handle4k_label, profile_probability::very_unlikely ());

	  /* We don't have a 4k boundary to deal with, so do
	     a load/shift/compare and jump to diff.  */

	  do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
				src1_addr, src2_addr, orig_src1, orig_src2);

	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
	  JUMP_LABEL (j) = diff_label;
	  LABEL_NUSES (diff_label) += 1;
	  emit_barrier ();

	  /* Finally in the unlikely case we are inching up to a
	     4k boundary we use a compact lbzx/compare loop to do
	     it a byte at a time.  */

	  emit_label (handle4k_label);

	  rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
	  emit_move_insn (ctr, cmp_rem);
	  rtx ixreg = gen_reg_rtx (Pmode);
	  emit_move_insn (ixreg, const0_rtx);

	  rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg);
	  rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg);
	  rtx d1 = gen_reg_rtx (word_mode);
	  rtx d2 = gen_reg_rtx (word_mode);

	  rtx fc_loop = gen_label_rtx ();
	  emit_label (fc_loop);

	  do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1);
	  do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2);

	  do_add3 (ixreg, ixreg, const1_rtx);

	  rtx cond = gen_reg_rtx (CCmode);
	  rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2);
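	  /* Emit the subtract as a record-form (dot) instruction so
	     COND is set from the byte difference while DIFF receives
	     the difference itself.  */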
	  rs6000_emit_dot_insn (diff, subexpr, 2, cond);

	  rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2);
	  if (TARGET_64BIT)
	    j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr,
					       eqrtx, cond));
	  else
	    j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr,
					       eqrtx, cond));
	  add_reg_br_prob_note (j, profile_probability::likely ());
	  JUMP_LABEL (j) = fc_loop;
	  LABEL_NUSES (fc_loop) += 1;

	  if (TARGET_64BIT)
	    emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
	  else
	    emit_move_insn (target, diff);

	  /* Since we are comparing bytes, the difference can be used
	     as the final result and we are done here.  */
	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
	  JUMP_LABEL (j) = final_label;
	  LABEL_NUSES (final_label) += 1;
	  emit_barrier ();
	}
    }

  emit_label (diff_label);
  /* Difference handling, 64->32 conversion.  */

  /* We need to produce DI result from sub, then convert to target SI
     while maintaining <0 / ==0 / >0 properties.  This sequence works:
     subfc L,A,B
     subfe H,H,H
     popcntd L,L
     rldimi L,H,6,0

     This is an alternate one Segher cooked up if somebody
     wants to expand this for something that doesn't have popcntd:
     subfc L,a,b
     subfe H,x,x
     addic t,L,-1
     subfe v,t,L
     or z,v,H

     And finally, p9 can just do this:
     cmpld A,B
     setb r */

  if (TARGET_P9_MISC)
    emit_insn (gen_setb_unsigned (target, dcond));
  else
    {
      if (TARGET_64BIT)
	{
	  rtx tmp_reg_ca = gen_reg_rtx (DImode);
	  emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
	  emit_insn (gen_popcntddi2 (diff, diff));
	  emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca));
	  emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
	}
      else
	{
	  rtx tmp_reg_ca = gen_reg_rtx (SImode);
	  emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
	  emit_insn (gen_popcntdsi2 (diff, diff));
	  emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca));
	}
    }

  if (library_call_label != NULL)
    {
      /* Branch around memcmp call.  */
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
      JUMP_LABEL (j) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();

      /* Make memcmp library call.  cmp_rem is the number of bytes not
	 compared by the inline code, which is the amount memcmp is
	 expected to compare.  If we don't find a difference in the loop
	 compare, do the library call directly instead of doing a small
	 compare just to get to an arbitrary boundary before calling it
	 anyway.  Also, update addresses to point to the next word to
	 examine.  */
      emit_label (library_call_label);

      rtx len_rtx = gen_reg_rtx (word_mode);
      if (bytes_is_const)
	{
	  emit_move_insn (len_rtx, cmp_rem);
	  do_add3 (src1_addr, src1_addr, iv1);
	  do_add3 (src2_addr, src2_addr, iv1);
	}
      else
	emit_move_insn (len_rtx, bytes_rtx);

      tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP);
      emit_library_call_value (XEXP (DECL_RTL (fun), 0),
			       target, LCT_NORMAL, GET_MODE (target),
			       src1_addr, Pmode,
			       src2_addr, Pmode,
			       len_rtx, GET_MODE (len_rtx));
    }

  /* Emit final_label.  */
  emit_label (final_label);
  return true;
}
1628
37ae4739
AS
1629/* Generate code to convert a DImode-plus-carry subtract result into
1630 a SImode result that has the same <0 / ==0 / >0 properties to
1631 produce the final result from memcmp.
8845cb37 1632
37ae4739
AS
1633 TARGET is the rtx for the register to receive the memcmp result.
1634 SUB_RESULT is the rtx for the register contining the subtract result. */

void
generate_6432_conversion (rtx target, rtx sub_result)
{
  /* We need to produce DI result from sub, then convert to target SI
     while maintaining <0 / ==0 / >0 properties.  This sequence works:
     subfc L,A,B
     subfe H,H,H
     popcntd L,L
     rldimi L,H,6,0

     This is an alternate one Segher cooked up if somebody
     wants to expand this for something that doesn't have popcntd:
     subfc L,a,b
     subfe H,x,x
     addic t,L,-1
     subfe v,t,L
     or z,v,H

     And finally, p9 can just do this:
     cmpld A,B
     setb r */

  if (TARGET_64BIT)
    {
      rtx tmp_reg_ca = gen_reg_rtx (DImode);
      emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
      rtx popcnt = gen_reg_rtx (DImode);
      emit_insn (gen_popcntddi2 (popcnt, sub_result));
      rtx tmp2 = gen_reg_rtx (DImode);
      emit_insn (gen_iordi3 (tmp2, popcnt, tmp_reg_ca));
      emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp2)));
    }
  else
    {
      rtx tmp_reg_ca = gen_reg_rtx (SImode);
      emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
      rtx popcnt = gen_reg_rtx (SImode);
      emit_insn (gen_popcntdsi2 (popcnt, sub_result));
      emit_insn (gen_iorsi3 (target, popcnt, tmp_reg_ca));
    }
}

/* Generate memcmp expansion using in-line non-loop GPR instructions.
   The bool return indicates whether code for a 64->32 conversion
   should be generated.

   BYTES is the number of bytes to be compared.
   BASE_ALIGN is the minimum alignment for both blocks to compare.
   ORIG_SRC1 is the original pointer to the first block to compare.
   ORIG_SRC2 is the original pointer to the second block to compare.
   SUB_RESULT is the reg rtx for the result from the final subtract.
   COND is the rtx for a condition register that will be used for the
   final compare on power9 or better.
   FINAL_RESULT is the reg rtx for the final memcmp result.
   P_CONVERT_LABEL is a pointer to rtx that will be used to store the
   label generated for a branch to the 64->32 code, if such a branch
   is needed.
   P_FINAL_LABEL is a pointer to rtx that will be used to store the label
   for the end of the memcmp if a branch there is needed.  */

bool
expand_block_compare_gpr (unsigned HOST_WIDE_INT bytes, unsigned int base_align,
                          rtx orig_src1, rtx orig_src2,
                          rtx sub_result, rtx cond, rtx final_result,
                          rtx *p_convert_label, rtx *p_final_label)
{
  /* Example of generated code for 18 bytes with 1-byte alignment.
     Compiled with -fno-reorder-blocks for clarity.
     ldbrx 10,31,8
     ldbrx 9,7,8
     subfc. 9,9,10
     bne 0,.L6487
     addi 9,12,8
     addi 5,11,8
     ldbrx 10,0,9
     ldbrx 9,0,5
     subfc. 9,9,10
     bne 0,.L6487
     addi 9,12,16
     lhbrx 10,0,9
     addi 9,11,16
     lhbrx 9,0,9
     subf 9,9,10
     b .L6488
     .p2align 4,,15
     .L6487: #convert_label
     popcntd 9,9
     subfe 10,10,10
     or 9,9,10
     .L6488: #final_label
     extsw 10,9

     We start off with DImode for two blocks that jump to the DI->SI
     conversion if the difference is found there, then a final block
     of HImode that skips the DI->SI conversion.  */

  unsigned HOST_WIDE_INT offset = 0;
  unsigned int load_mode_size;
  HOST_WIDE_INT cmp_bytes = 0;
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;
  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
  bool need_6432_conv = false;
  rtx convert_label = NULL;
  rtx final_label = NULL;
  machine_mode load_mode;

  while (bytes > 0)
    {
      unsigned int align = compute_current_alignment (base_align, offset);
      load_mode = select_block_compare_mode (offset, bytes, align);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes >= load_mode_size)
        cmp_bytes = load_mode_size;
      else if (!targetm.slow_unaligned_access (load_mode,
                                               align * BITS_PER_UNIT))
        {
          /* Move this load back so it doesn't go past the end.
             P8/P9 can do this efficiently.  */
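          /* For example, with 5 bytes left at offset 8 and a DImode
             load selected, extra_bytes is 3; backing the offset up to
             5 loads bytes 5..12, re-comparing bytes 5..7 that are
             already known equal, which is harmless.  */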
          unsigned int extra_bytes = load_mode_size - bytes;
          cmp_bytes = bytes;
          if (extra_bytes < offset)
            {
              offset -= extra_bytes;
              cmp_bytes = load_mode_size;
              bytes = cmp_bytes;
            }
        }
      else
        /* P7 and earlier can't do the overlapping load trick fast,
           so this forces a non-overlapping load and a shift to get
           rid of the extra bytes.  */
        cmp_bytes = bytes;

      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      if (!REG_P (XEXP (src1, 0)))
        {
          rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
          src1 = replace_equiv_address (src1, src1_reg);
        }
      set_mem_size (src1, load_mode_size);

      if (!REG_P (XEXP (src2, 0)))
        {
          rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
          src2 = replace_equiv_address (src2, src2_reg);
        }
      set_mem_size (src2, load_mode_size);

      do_load_for_compare (tmp_reg_src1, src1, load_mode);
      do_load_for_compare (tmp_reg_src2, src2, load_mode);

      if (cmp_bytes < load_mode_size)
        {
          /* Shift unneeded bytes off.  */
          rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
          if (word_mode == DImode)
            {
              emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
              emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
            }
          else
            {
              emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
              emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
            }
        }
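
      /* The compare loads are done in big-endian byte order (ldbrx on
         little-endian targets), so the first bytes of the block sit in
         the most significant end of the register and the logical shift
         right above discards exactly the bytes past cmp_bytes.  */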

      int remain = bytes - cmp_bytes;
      if (GET_MODE_SIZE (GET_MODE (final_result)) > GET_MODE_SIZE (load_mode))
        {
          /* Final_result is larger than load size so we don't need to
             reduce result size.  */

          /* We previously did a block that needed 64->32 conversion but
             the current block does not, so a label is needed to jump
             to the end.  */
          if (need_6432_conv && !final_label)
            final_label = gen_label_rtx ();

          if (remain > 0)
            {
              /* This is not the last block, branch to the end if the
                 result of this subtract is not zero.  */
              if (!final_label)
                final_label = gen_label_rtx ();
              rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
              rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
              rtx cr = gen_reg_rtx (CCmode);
              rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
              emit_insn (gen_movsi (final_result,
                                    gen_lowpart (SImode, tmp_reg_src2)));
              rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
              rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
                                                 fin_ref, pc_rtx);
              rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
              add_reg_br_prob_note (j, profile_probability::unlikely ());
              JUMP_LABEL (j) = final_label;
              LABEL_NUSES (final_label) += 1;
            }
          else
            {
              if (word_mode == DImode)
                {
                  emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
                                         tmp_reg_src2));
                  emit_insn (gen_movsi (final_result,
                                        gen_lowpart (SImode, tmp_reg_src2)));
                }
              else
                emit_insn (gen_subsi3 (final_result,
                                       tmp_reg_src1, tmp_reg_src2));

              if (final_label)
                {
                  rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
                  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
                  JUMP_LABEL (j) = final_label;
                  LABEL_NUSES (final_label) += 1;
                  emit_barrier ();
                }
            }
        }
      else
        {
          /* Do we need a 64->32 conversion block?  We need the 64->32
             conversion even if final_result size == load_mode size because
             the subtract generates one extra bit.  */
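          /* E.g. a DImode subtract of 0 - 1 wraps to all ones: the sign
             of the 64-bit difference alone cannot represent the unsigned
             ordering, so the borrow must be folded in as well.  */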
          need_6432_conv = true;

          if (remain > 0)
            {
              if (!convert_label)
                convert_label = gen_label_rtx ();

              /* Compare to zero and branch to convert_label if not zero.  */
              rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
              if (TARGET_P9_MISC)
                {
                  /* Generate a compare, and convert with a setb later.
                     Use cond that is passed in because the caller needs
                     to use it for the 64->32 conversion later.  */
                  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
                                             tmp_reg_src2);
                  emit_insn (gen_rtx_SET (cond, cmp));
                }
              else
                {
                  /* Generate a subfc. and use the longer sequence for
                     conversion.  Cond is not used outside this
                     function in this case.  */
                  cond = gen_reg_rtx (CCmode);
                  if (TARGET_64BIT)
                    emit_insn (gen_subfdi3_carry_dot2 (sub_result, tmp_reg_src2,
                                                       tmp_reg_src1, cond));
                  else
                    emit_insn (gen_subfsi3_carry_dot2 (sub_result, tmp_reg_src2,
                                                       tmp_reg_src1, cond));
                }

              rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
              rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
                                                 cvt_ref, pc_rtx);
              rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
              add_reg_br_prob_note (j, profile_probability::likely ());
              JUMP_LABEL (j) = convert_label;
              LABEL_NUSES (convert_label) += 1;
            }
          else
            {
              /* Just do the subtract/compare.  Since this is the last block
                 the convert code will be generated immediately following.  */
              if (TARGET_P9_MISC)
                {
                  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
                                             tmp_reg_src2);
                  emit_insn (gen_rtx_SET (cond, cmp));
                }
              else if (TARGET_64BIT)
                emit_insn (gen_subfdi3_carry (sub_result, tmp_reg_src2,
                                              tmp_reg_src1));
              else
                emit_insn (gen_subfsi3_carry (sub_result, tmp_reg_src2,
                                              tmp_reg_src1));
            }
        }

      offset += cmp_bytes;
      bytes -= cmp_bytes;
    }

  if (convert_label)
    *p_convert_label = convert_label;
  if (final_label)
    *p_final_label = final_label;
  return need_6432_conv;
}

/* Expand a block compare operation, and return true if successful.
   Return false if we should let the compiler generate normal code,
   probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_block_compare (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);

  /* Allow this param to shut off all expansion.  */
  if (rs6000_block_compare_inline_limit == 0)
    return false;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.
     However slow_unaligned_access returns true on P7 even though the
     performance of this code is good there.  */
  if (!isP7
      && (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
          || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2))))
    return false;

  /* Unaligned l*brx traps on P7 so don't do this.  However this should
     not affect much because LE isn't really supported on P7 anyway.  */
  if (isP7 && !BYTES_BIG_ENDIAN)
    return false;

  /* If this is not a fixed size compare, try generating loop code and
     if that fails just call memcmp.  */
  if (!CONST_INT_P (bytes_rtx))
    return expand_compare_loop (operands);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int align_by_bits = UINTVAL (align_rtx);
  unsigned int base_align = align_by_bits / BITS_PER_UNIT;

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to move?  */
  unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
  if (bytes == 0)
    return true;

  /* P7/P8 code uses cond for subfc., but P9 uses
     it for cmpld, which needs CCUNSmode.  */
  rtx cond = NULL;
  if (TARGET_P9_MISC)
    cond = gen_reg_rtx (CCUNSmode);

  /* Is it OK to use vec/vsx for this?  TARGET_VSX means we have at
     least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
     at least POWER8.  That way we can rely on overlapping compares to
     do the final comparison of less than 16 bytes.  Also I do not
     want to deal with making this work for 32 bits.  In addition, we
     have to make sure that we have at least P8_VECTOR (we don't allow
     P9_VECTOR without P8_VECTOR).  */
  int use_vec = (bytes >= 33 && !TARGET_32BIT
                 && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR);

  /* We don't want to generate too much code.  The loop code can take
     over for lengths greater than 31 bytes.  */
  unsigned HOST_WIDE_INT max_bytes = rs6000_block_compare_inline_limit;

  /* Don't generate too much code if vsx was disabled.  */
  if (!use_vec && max_bytes > 1)
    max_bytes = ((max_bytes + 1) / 2) - 1;
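
  /* E.g. if the limit is 63 bytes, a GPR-only expansion is capped at
     (63 + 1) / 2 - 1 = 31 bytes, matching the loop threshold noted
     above.  */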

  if (!IN_RANGE (bytes, 1, max_bytes))
    return expand_compare_loop (operands);

  /* The code generated for p7 and older is not faster than glibc
     memcmp if alignment is small and length is not short, so bail
     out to avoid those conditions.  */
  if (targetm.slow_unaligned_access (word_mode, align_by_bits)
      && ((base_align == 1 && bytes > 16)
          || (base_align == 2 && bytes > 32)))
    return false;

  rtx final_label = NULL;

  if (use_vec)
    {
      rtx final_move_label = gen_label_rtx ();
      rtx s1addr = gen_reg_rtx (Pmode);
      rtx s2addr = gen_reg_rtx (Pmode);
      rtx off_reg = gen_reg_rtx (Pmode);
      rtx cleanup_label = NULL;
      rtx vec_result = gen_reg_rtx (V16QImode);
      rtx s1data = gen_reg_rtx (V16QImode);
      rtx s2data = gen_reg_rtx (V16QImode);
      rtx result_reg = gen_reg_rtx (word_mode);
      emit_move_insn (result_reg, GEN_INT (0));

      expand_cmp_vec_sequence (bytes, orig_src1, orig_src2,
                               s1addr, s2addr, off_reg, s1data, s2data,
                               vec_result, false,
                               &cleanup_label, final_move_label, false);

      if (cleanup_label)
        emit_label (cleanup_label);

      emit_insn (gen_one_cmplv16qi2 (vec_result, vec_result));

      emit_final_compare_vec (s1data, s2data, result_reg,
                              s1addr, s2addr, orig_src1, orig_src2,
                              off_reg, vec_result);

      emit_label (final_move_label);
      emit_insn (gen_movsi (target,
                            gen_lowpart (SImode, result_reg)));
    }
  else
    { /* Generate GPR code.  */

      rtx convert_label = NULL;
      rtx sub_result = gen_reg_rtx (word_mode);
      bool need_6432_conversion =
        expand_block_compare_gpr (bytes, base_align,
                                  orig_src1, orig_src2,
                                  sub_result, cond, target,
                                  &convert_label, &final_label);

      if (need_6432_conversion)
        {
          if (convert_label)
            emit_label (convert_label);
          if (TARGET_P9_MISC)
            emit_insn (gen_setb_unsigned (target, cond));
          else
            generate_6432_conversion (target, sub_result);
        }
    }

  if (final_label)
    emit_label (final_label);

  return true;
}

/* Generate page crossing check and branch code to set up for
   strncmp when we don't have DI alignment.
   STRNCMP_LABEL is the label to branch to if there is a page crossing.
   SRC_ADDR is the string address to be examined.
   BYTES is the max number of bytes to compare.  */
static void
expand_strncmp_align_check (rtx strncmp_label, rtx src_addr, HOST_WIDE_INT bytes)
{
  rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label);
  rtx src_pgoff = gen_reg_rtx (GET_MODE (src_addr));
  do_and3 (src_pgoff, src_addr, GEN_INT (0xfff));
  rtx cond = gen_reg_rtx (CCmode);
  emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_pgoff,
                                         GEN_INT (4096 - bytes)));

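  /* Branch if src_pgoff >= 4096 - bytes, i.e. if reading BYTES bytes
     starting at SRC_ADDR could run onto the following 4K page.  */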
  rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx);

  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
                                     lab_ref, pc_rtx);
  rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
  add_reg_br_prob_note (j, profile_probability::unlikely ());
  JUMP_LABEL (j) = strncmp_label;
  LABEL_NUSES (strncmp_label) += 1;
}

/* Generate the sequence of compares for strcmp/strncmp using gpr instructions.
   BYTES_TO_COMPARE is the number of bytes to be compared.
   BASE_ALIGN is the smaller of the alignment of the two strings.
   ORIG_SRC1 is the unmodified rtx for the first string.
   ORIG_SRC2 is the unmodified rtx for the second string.
   TMP_REG_SRC1 is the register for loading the first string.
   TMP_REG_SRC2 is the register for loading the second string.
   RESULT_REG is the rtx for the result register.
   EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
   to strcmp/strncmp if we have equality at the end of the inline comparison.
   P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
   to clean up and generate the final comparison result.
   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
   set the final result.  */
static void
expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
                             unsigned int base_align,
                             rtx orig_src1, rtx orig_src2,
                             rtx tmp_reg_src1, rtx tmp_reg_src2, rtx result_reg,
                             bool equality_compare_rest, rtx *p_cleanup_label,
                             rtx final_move_label)
{
  unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
  machine_mode load_mode;
  unsigned int load_mode_size;
  unsigned HOST_WIDE_INT cmp_bytes = 0;
  unsigned HOST_WIDE_INT offset = 0;
  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
  gcc_assert (p_cleanup_label != NULL);
  rtx cleanup_label = *p_cleanup_label;

  while (bytes_to_compare > 0)
    {
      /* GPR compare sequence:
         check each 8B with: ld/ld/cmpb/cmpb/orc./bne

         cleanup code at end:
         cntlzd get bit of first zero/diff byte
         subfic convert for rldcl use
         rldcl rldcl extract diff/zero byte
         subf subtract for final result

         The last compare can branch around the cleanup code if the
         result is zero because the strings are exactly equal.  */
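
      /* cmpb writes 0xff into each result byte where the corresponding
         source bytes are equal and 0x00 where they differ, so or-ing
         the complement of the difference mask with the zero-byte mask
         is nonzero exactly when this chunk contains a difference or a
         terminating zero byte.  */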

      unsigned int align = compute_current_alignment (base_align, offset);
      load_mode = select_block_compare_mode (offset, bytes_to_compare, align);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes_to_compare >= load_mode_size)
        cmp_bytes = load_mode_size;
      else if (!targetm.slow_unaligned_access (load_mode,
                                               align * BITS_PER_UNIT))
        {
          /* Move this load back so it doesn't go past the end.
             P8/P9 can do this efficiently.  */
          unsigned int extra_bytes = load_mode_size - bytes_to_compare;
          cmp_bytes = bytes_to_compare;
          if (extra_bytes < offset)
            {
              offset -= extra_bytes;
              cmp_bytes = load_mode_size;
              bytes_to_compare = cmp_bytes;
            }
        }
      else
        /* P7 and earlier can't do the overlapping load trick fast,
           so this forces a non-overlapping load and a shift to get
           rid of the extra bytes.  */
        cmp_bytes = bytes_to_compare;

      rtx offset_rtx;
      if (BYTES_BIG_ENDIAN || TARGET_AVOID_XFORM)
        offset_rtx = GEN_INT (offset);
      else
        {
          offset_rtx = gen_reg_rtx (Pmode);
          emit_move_insn (offset_rtx, GEN_INT (offset));
        }
      rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_rtx);
      rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_rtx);

      do_load_for_compare_from_addr (load_mode, tmp_reg_src1, addr1, orig_src1);
      do_load_for_compare_from_addr (load_mode, tmp_reg_src2, addr2, orig_src2);

      /* We must always left-align the data we read, and
         clear any bytes to the right that are beyond the string.
         Otherwise the cmpb sequence won't produce the correct
         results.  However if there is only one byte left, we
         can just subtract to get the final result so the shifts
         and clears are not needed.  */

      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;

      /* Loading just a single byte is a special case.  If we are
         loading more than that, we have to check whether we are
         looking at the entire chunk of data.  If not, rotate left and
         clear right so that bytes we aren't supposed to look at are
         zeroed, and the first byte we are supposed to compare is
         leftmost.  */
      if (load_mode_size != 1)
        {
          if (load_mode_size < word_mode_size)
            {
              /* Rotate left first.  */
              rtx sh = GEN_INT (BITS_PER_UNIT
                                * (word_mode_size - load_mode_size));
              do_rotl3 (tmp_reg_src1, tmp_reg_src1, sh);
              do_rotl3 (tmp_reg_src2, tmp_reg_src2, sh);
            }

          if (cmp_bytes < word_mode_size)
            {
              /* Now clear right.  This plus the rotate can be
                 turned into a rldicr instruction.  */
              HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
              rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
              do_and3 (tmp_reg_src1, tmp_reg_src1, mask);
              do_and3 (tmp_reg_src2, tmp_reg_src2, mask);
            }
        }

      /* Cases to handle.  A and B are chunks of the two strings.
         1: Not end of comparison:
            A != B: branch to cleanup code to compute result.
            A == B: check for 0 byte, next block if not found.
         2: End of the inline comparison:
            A != B: branch to cleanup code to compute result.
            A == B: check for 0 byte, call strcmp/strncmp.
         3: Compared requested N bytes:
            A == B: branch to result 0.
            A != B: cleanup code to compute result.  */

      rtx dst_label;
      if (remain > 0 || equality_compare_rest)
        {
          /* Branch to cleanup code, otherwise fall through to do
             more compares.  */
          if (!cleanup_label)
            cleanup_label = gen_label_rtx ();
          dst_label = cleanup_label;
        }
      else
        /* Branch to end and produce result of 0.  */
        dst_label = final_move_label;

      if (load_mode_size == 1)
        {
          /* Special case for comparing just a single byte.  */
          if (equality_compare_rest)
            {
              /* Use subf./bne to branch to final_move_label if the
                 byte differs, otherwise fall through to the strncmp
                 call.  We must also check for a zero byte here as we
                 must not make the library call if this is the end of
                 the string.  */

              rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
              rtx cond = gen_reg_rtx (CCmode);
              rtx diff_rtx = gen_rtx_MINUS (word_mode,
                                            tmp_reg_src1, tmp_reg_src2);
              rs6000_emit_dot_insn (result_reg, diff_rtx, 2, cond);
              rtx cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

              rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
                                                 lab_ref, pc_rtx);
              rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
              add_reg_br_prob_note (j, profile_probability::unlikely ());
              JUMP_LABEL (j) = final_move_label;
              LABEL_NUSES (final_move_label) += 1;

              /* Check for zero byte here before fall through to
                 library call.  This catches the case where the
                 strings are equal and end in a zero byte at this
                 position.  */

              rtx cond0 = gen_reg_rtx (CCmode);
              emit_move_insn (cond0, gen_rtx_COMPARE (CCmode, tmp_reg_src1,
                                                      const0_rtx));

              rtx cmp0eq_rtx = gen_rtx_EQ (VOIDmode, cond0, const0_rtx);

              rtx ifelse0 = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp0eq_rtx,
                                                  lab_ref, pc_rtx);
              rtx_insn *j0 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse0));
              add_reg_br_prob_note (j0, profile_probability::unlikely ());
              JUMP_LABEL (j0) = final_move_label;
              LABEL_NUSES (final_move_label) += 1;
            }
          else
            {
              /* This is the last byte to be compared so we can use
                 subf to compute the final result and branch
                 unconditionally to final_move_label.  */

              do_sub3 (result_reg, tmp_reg_src1, tmp_reg_src2);

              rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
              rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
              JUMP_LABEL (j) = final_move_label;
              LABEL_NUSES (final_move_label) += 1;
              emit_barrier ();
            }
        }
      else
        {
          rtx cmpb_zero = gen_reg_rtx (word_mode);
          rtx cmpb_diff = gen_reg_rtx (word_mode);
          rtx zero_reg = gen_reg_rtx (word_mode);
          rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
          rtx cond = gen_reg_rtx (CCmode);

          emit_move_insn (zero_reg, GEN_INT (0));
          do_cmpb3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2);
          do_cmpb3 (cmpb_zero, tmp_reg_src1, zero_reg);
          rtx not_diff = gen_rtx_NOT (word_mode, cmpb_diff);
          rtx orc_rtx = gen_rtx_IOR (word_mode, not_diff, cmpb_zero);

          rs6000_emit_dot_insn (result_reg, orc_rtx, 2, cond);

          rtx cmp_rtx;
          if (remain == 0 && !equality_compare_rest)
            cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
          else
            cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

          rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
                                             lab_ref, pc_rtx);
          rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
          add_reg_br_prob_note (j, profile_probability::unlikely ());
          JUMP_LABEL (j) = dst_label;
          LABEL_NUSES (dst_label) += 1;
        }

      offset += cmp_bytes;
      bytes_to_compare -= cmp_bytes;
    }

  *p_cleanup_label = cleanup_label;
  return;
}

/* Generate the final sequence that identifies the differing
   byte and generates the final result, taking into account
   zero bytes:

   cntlzd get bit of first zero/diff byte
   addi convert for rldcl use
   rldcl rldcl extract diff/zero byte
   subf subtract for final result

   STR1 is the reg rtx for data from string 1.
   STR2 is the reg rtx for data from string 2.
   RESULT is the reg rtx for the comparison result.  */

static void
emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result)
{
  machine_mode m = GET_MODE (str1);
  rtx rot_amt = gen_reg_rtx (m);

  rtx rot1_1 = gen_reg_rtx (m);
  rtx rot1_2 = gen_reg_rtx (m);
  rtx rot2_1 = gen_reg_rtx (m);
  rtx rot2_2 = gen_reg_rtx (m);

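  /* RESULT holds the cmpb-based mask, so counting its leading zeros
     gives the bit position of the first differing or zero byte; e.g.
     if that is byte 2, clz returns 16 and adding 8 gives a rotate
     amount of 24, which brings that byte into the least significant
     position for the masked subtract below.  */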
  if (m == SImode)
    {
      emit_insn (gen_clzsi2 (rot_amt, result));
      emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotlsi3 (rot1_1, str1,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotlsi3 (rot2_1, str2,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subsi3 (result, rot1_2, rot2_2));
    }
  else if (m == DImode)
    {
      emit_insn (gen_clzdi2 (rot_amt, result));
      emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotldi3 (rot1_1, str1,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotldi3 (rot2_1, str2,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subdi3 (result, rot1_2, rot2_2));
    }
  else
    gcc_unreachable ();

  return;
}

/* Expand a string compare operation with length, and return
   true if successful.  Return false if we should let the
   compiler generate normal code, probably a strncmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   If NO_LENGTH is zero, then:
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment in bytes.
   If NO_LENGTH is nonzero, then:
   OPERANDS[3] is the alignment in bytes.  */
bool
expand_strn_compare (rtx operands[], int no_length)
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx, align_rtx;
  if (no_length)
    {
      bytes_rtx = NULL;
      align_rtx = operands[3];
    }
  else
    {
      bytes_rtx = operands[3];
      align_rtx = operands[4];
    }

  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));

  /* If we have a length, it must be constant.  This simplifies things
     a bit as we don't have to generate code to check if we've exceeded
     the length.  Later this could be expanded to handle this case.  */
  if (!no_length && !CONST_INT_P (bytes_rtx))
    return false;

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx);
  unsigned int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  unsigned int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.  */
  if (targetm.slow_unaligned_access (word_mode, align1)
      || targetm.slow_unaligned_access (word_mode, align2))
    return false;

  gcc_assert (GET_MODE (target) == SImode);

  unsigned int required_align = 8;

  unsigned HOST_WIDE_INT offset = 0;
  unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available.  */
  unsigned HOST_WIDE_INT compare_length; /* How much to compare inline.  */

  if (no_length)
    bytes = rs6000_string_compare_inline_limit;
  else
    bytes = UINTVAL (bytes_rtx);

  /* Is it OK to use vec/vsx for this?  TARGET_VSX means we have at
     least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
     at least POWER8.  That way we can rely on overlapping compares to
     do the final comparison of less than 16 bytes.  Also I do not
     want to deal with making this work for 32 bits.  In addition, we
     have to make sure that we have at least P8_VECTOR (we don't allow
     P9_VECTOR without P8_VECTOR).  */
  int use_vec = (bytes >= 16 && !TARGET_32BIT
                 && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR);

  if (use_vec)
    required_align = 16;

  machine_mode load_mode;
  rtx tmp_reg_src1, tmp_reg_src2;
  if (use_vec)
    {
      load_mode = V16QImode;
      tmp_reg_src1 = gen_reg_rtx (V16QImode);
      tmp_reg_src2 = gen_reg_rtx (V16QImode);
    }
  else
    {
      load_mode = select_block_compare_mode (0, bytes, base_align);
      tmp_reg_src1 = gen_reg_rtx (word_mode);
      tmp_reg_src2 = gen_reg_rtx (word_mode);
    }

  compare_length = rs6000_string_compare_inline_limit;

  /* If we have equality at the end of the last compare and we have not
     found the end of the string, we need to call strcmp/strncmp to
     compare the remainder.  */
  bool equality_compare_rest = false;

  if (no_length)
    {
      bytes = compare_length;
      equality_compare_rest = true;
    }
  else
    {
      if (bytes <= compare_length)
        compare_length = bytes;
      else
        equality_compare_rest = true;
    }

  rtx result_reg = gen_reg_rtx (word_mode);
  rtx final_move_label = gen_label_rtx ();
  rtx final_label = gen_label_rtx ();
  rtx begin_compare_label = NULL;

  if (base_align < required_align)
    {
      /* Generate code that checks distance to 4k boundary for this case.  */
      begin_compare_label = gen_label_rtx ();
      rtx strncmp_label = gen_label_rtx ();
      rtx jmp;

      /* Strncmp for power8 in glibc does this:
         rldicl r8,r3,0,52
         cmpldi cr7,r8,4096-16
         bgt cr7,L(pagecross) */

      /* Make sure that the length we use for the alignment test and
         the subsequent code generation are in agreement so we do not
         go past the length we tested for a 4k boundary crossing.  */
      unsigned HOST_WIDE_INT align_test = compare_length;
      if (align_test < required_align)
        {
          align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
          base_align = align_test;
        }
      else
        {
          align_test = ROUND_UP (align_test, required_align);
          base_align = required_align;
        }
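
      /* For example, an inline compare of 10 bytes with a required
         alignment of 8 rounds align_test up to 16, so the page
         crossing checks below cover the full window the inline
         sequence may read.  */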

      if (align1 < required_align)
        expand_strncmp_align_check (strncmp_label, src1_addr, align_test);
      if (align2 < required_align)
        expand_strncmp_align_check (strncmp_label, src2_addr, align_test);

      /* Now generate the following sequence:
         - branch to begin_compare
         - strncmp_label
         - call to strncmp
         - branch to final_label
         - begin_compare_label */

      rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
      JUMP_LABEL (jmp) = begin_compare_label;
      LABEL_NUSES (begin_compare_label) += 1;
      emit_barrier ();

      emit_label (strncmp_label);

      if (no_length)
        {
          tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
          emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                                   target, LCT_NORMAL, GET_MODE (target),
                                   force_reg (Pmode, src1_addr), Pmode,
                                   force_reg (Pmode, src2_addr), Pmode);
        }
      else
        {
          /* -m32 -mpowerpc64 results in word_mode being DImode even
             though otherwise it is 32-bit.  The length arg to strncmp
             is a size_t which will be the same size as pointers.  */
          rtx len_rtx = gen_reg_rtx (Pmode);
          emit_move_insn (len_rtx, gen_int_mode (bytes, Pmode));

          tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
          emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                                   target, LCT_NORMAL, GET_MODE (target),
                                   force_reg (Pmode, src1_addr), Pmode,
                                   force_reg (Pmode, src2_addr), Pmode,
                                   len_rtx, Pmode);
        }

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
      emit_label (begin_compare_label);
    }

  rtx cleanup_label = NULL;
  rtx s1addr = NULL, s2addr = NULL, off_reg = NULL, vec_result = NULL;

  /* Generate a sequence of GPR or VEC/VSX instructions to compare out
     to the length specified.  */
  if (use_vec)
    {
      s1addr = gen_reg_rtx (Pmode);
      s2addr = gen_reg_rtx (Pmode);
      off_reg = gen_reg_rtx (Pmode);
      vec_result = gen_reg_rtx (load_mode);
      emit_move_insn (result_reg, GEN_INT (0));
      expand_cmp_vec_sequence (compare_length,
                               orig_src1, orig_src2,
                               s1addr, s2addr, off_reg,
                               tmp_reg_src1, tmp_reg_src2,
                               vec_result,
                               equality_compare_rest,
                               &cleanup_label, final_move_label, true);
    }
  else
    expand_strncmp_gpr_sequence (compare_length, base_align,
                                 orig_src1, orig_src2,
                                 tmp_reg_src1, tmp_reg_src2,
                                 result_reg,
                                 equality_compare_rest,
                                 &cleanup_label, final_move_label);

  offset = compare_length;

  if (equality_compare_rest)
    {
      /* Update pointers past what has been compared already.  */
      rtx src1 = force_reg (Pmode,
                            gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset)));
      rtx src2 = force_reg (Pmode,
                            gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset)));

      /* Construct call to strcmp/strncmp to compare the rest of the string.  */
      if (no_length)
        {
          tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
          emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                                   target, LCT_NORMAL, GET_MODE (target),
                                   src1, Pmode, src2, Pmode);
        }
      else
        {
          rtx len_rtx = gen_reg_rtx (Pmode);
          emit_move_insn (len_rtx, gen_int_mode (bytes - compare_length, Pmode));
          tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
          emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                                   target, LCT_NORMAL, GET_MODE (target),
                                   src1, Pmode, src2, Pmode, len_rtx, Pmode);
        }

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }

  if (cleanup_label)
    emit_label (cleanup_label);

  if (use_vec)
    emit_final_compare_vec (tmp_reg_src1, tmp_reg_src2, result_reg,
                            s1addr, s2addr, orig_src1, orig_src2,
                            off_reg, vec_result);
  else
    emit_final_str_compare_gpr (tmp_reg_src1, tmp_reg_src2, result_reg);

  emit_label (final_move_label);
  emit_insn (gen_movsi (target,
                        gen_lowpart (SImode, result_reg)));
  emit_label (final_label);
  return true;
}

/* Generate loads and stores for a move of v4si mode using lvx/stvx.
   This uses altivec_{l,st}vx_<mode>_internal which use unspecs to
   keep combine from changing what instruction gets used.

   DEST is the destination for the data.
   SRC is the source of the data for the move.  */

static rtx
gen_lvx_v4si_move (rtx dest, rtx src)
{
  gcc_assert (MEM_P (dest) ^ MEM_P (src));
  gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);

  if (MEM_P (dest))
    return gen_altivec_stvx_v4si_internal (dest, src);
  else
    return gen_altivec_lvx_v4si_internal (dest, src);
}

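/* Generate a load or store using lxvl/stxvl for a move of LENGTH
   bytes (at most 16) between DEST and SRC, exactly one of which is a
   V16QImode MEM.  */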
static rtx
gen_lxvl_stxvl_move (rtx dest, rtx src, int length)
{
  gcc_assert (MEM_P (dest) ^ MEM_P (src));
  gcc_assert (GET_MODE (dest) == V16QImode && GET_MODE (src) == V16QImode);
  gcc_assert (length <= 16);

  bool is_store = MEM_P (dest);
  rtx addr;

  /* If the address form is not a simple register, make it so.  */
  if (is_store)
    addr = XEXP (dest, 0);
  else
    addr = XEXP (src, 0);

  if (!REG_P (addr))
    addr = force_reg (Pmode, addr);

  rtx len = force_reg (DImode, gen_int_mode (length, DImode));
  if (is_store)
    return gen_stxvl (src, addr, len);
  else
    return gen_lxvl (dest, addr, len);
}

/* Expand a block move operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the source
   operands[2] is the length
   operands[3] is the alignment */

#define MAX_MOVE_REG 4

int
expand_block_move (rtx operands[], bool might_overlap)
{
  rtx orig_dest = operands[0];
  rtx orig_src = operands[1];
  rtx bytes_rtx = operands[2];
  rtx align_rtx = operands[3];
  int constp = CONST_INT_P (bytes_rtx);
  int align;
  int bytes;
  int offset;
  int move_bytes;
  rtx loads[MAX_MOVE_REG];
  rtx stores[MAX_MOVE_REG];
  int num_reg = 0;

  /* If this is not a fixed size move, just call memcpy */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment */
  gcc_assert (CONST_INT_P (align_rtx));
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to move? */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  if (bytes > rs6000_block_move_inline_limit)
    return 0;

  int orig_bytes = bytes;
  for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
    {
      union {
        rtx (*mov) (rtx, rtx);
        rtx (*movlen) (rtx, rtx, int);
      } gen_func;
      machine_mode mode = BLKmode;
      rtx src, dest;
      bool move_with_length = false;

      /* Use OOmode for paired vsx load/store.  Use V2DI for single
         unaligned vsx load/store, for consistency with what other
         expansions (compare) already do, and so we can use lxvd2x on
         p8.  Order is VSX pair unaligned, VSX unaligned, Altivec, VSX
         with length < 16 (if allowed), then gpr load/store.  */

      if (TARGET_MMA && TARGET_BLOCK_OPS_UNALIGNED_VSX
          && TARGET_BLOCK_OPS_VECTOR_PAIR
          && bytes >= 32
          && (align >= 256 || !STRICT_ALIGNMENT))
        {
          move_bytes = 32;
          mode = OOmode;
          gen_func.mov = gen_movoo;
        }
      else if (TARGET_POWERPC64 && TARGET_BLOCK_OPS_UNALIGNED_VSX
               && VECTOR_MEM_VSX_P (V2DImode)
               && bytes >= 16 && (align >= 128 || !STRICT_ALIGNMENT))
        {
          move_bytes = 16;
          mode = V2DImode;
          gen_func.mov = gen_vsx_movv2di_64bit;
        }
      else if (TARGET_BLOCK_OPS_UNALIGNED_VSX
               /* Only use lxvl/stxvl on 64bit POWER10.  */
               && TARGET_POWER10
               && TARGET_64BIT
               && bytes < 16
               && orig_bytes > 16
               && !(bytes == 1
                    || bytes == 2
                    || bytes == 4
                    || bytes == 8)
               && (align >= 128
                   || !STRICT_ALIGNMENT))
        {
          /* Only use lxvl/stxvl if it could replace multiple ordinary
             loads+stores.  Also don't use it unless we likely already
             did one vsx copy so we aren't mixing gpr and vsx.  */
          move_bytes = bytes;
          mode = V16QImode;
          gen_func.movlen = gen_lxvl_stxvl_move;
          move_with_length = true;
        }
      else if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
        {
          move_bytes = 16;
          mode = V4SImode;
          gen_func.mov = gen_lvx_v4si_move;
        }
      else if (bytes >= 8 && TARGET_POWERPC64
               && (align >= 64 || !STRICT_ALIGNMENT))
        {
          move_bytes = 8;
          mode = DImode;
          gen_func.mov = gen_movdi;
          if (offset == 0 && align < 64)
            {
              rtx addr;

              /* If the address form is reg+offset with offset not a
                 multiple of four, reload into reg indirect form here
                 rather than waiting for reload.  This way we get one
                 reload, not one per load and/or store.  */
              addr = XEXP (orig_dest, 0);
              if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
                  && CONST_INT_P (XEXP (addr, 1))
                  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
                {
                  addr = copy_addr_to_reg (addr);
                  orig_dest = replace_equiv_address (orig_dest, addr);
                }
              addr = XEXP (orig_src, 0);
              if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
                  && CONST_INT_P (XEXP (addr, 1))
                  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
                {
                  addr = copy_addr_to_reg (addr);
                  orig_src = replace_equiv_address (orig_src, addr);
                }
            }
        }
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
        { /* move 4 bytes */
          move_bytes = 4;
          mode = SImode;
          gen_func.mov = gen_movsi;
        }
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
        { /* move 2 bytes */
          move_bytes = 2;
          mode = HImode;
          gen_func.mov = gen_movhi;
        }
      else /* move 1 byte at a time */
        {
          move_bytes = 1;
          mode = QImode;
          gen_func.mov = gen_movqi;
        }

      /* If we can't succeed in doing the move in one pass, we can't
         do it in the might_overlap case.  Bail out and return
         failure.  We test num_reg + 1 >= MAX_MOVE_REG here to check
         the same condition as the test of num_reg >= MAX_MOVE_REG
         that is done below after the increment of num_reg.  */
      if (might_overlap && num_reg + 1 >= MAX_MOVE_REG
          && bytes > move_bytes)
        return 0;

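      /* Emitting all loads of a group before its stores means that,
         once the bail-out above is passed, every source byte in the
         might_overlap case is read before any destination byte is
         written, which is what makes the single-pass move safe.  */
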
      /* Mode is always set to something other than BLKmode by one of the
         cases of the if statement above.  */
      gcc_assert (mode != BLKmode);

      src = adjust_address (orig_src, mode, offset);
      dest = adjust_address (orig_dest, mode, offset);

      rtx tmp_reg = gen_reg_rtx (mode);

      if (move_with_length)
        {
          loads[num_reg] = (*gen_func.movlen) (tmp_reg, src, move_bytes);
          stores[num_reg++] = (*gen_func.movlen) (dest, tmp_reg, move_bytes);
        }
      else
        {
          loads[num_reg] = (*gen_func.mov) (tmp_reg, src);
          stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
        }

      /* Emit loads and stores saved up.  */
      if (num_reg >= MAX_MOVE_REG || bytes == move_bytes)
        {
          int i;
          for (i = 0; i < num_reg; i++)
            emit_insn (loads[i]);
          for (i = 0; i < num_reg; i++)
            emit_insn (stores[i]);
          num_reg = 0;
        }
    }

  return 1;
}