]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/rs6000/rs6000-string.c
Although there's no fundamental reason why shrink wrapping and speculation tracking...
[thirdparty/gcc.git] / gcc / config / rs6000 / rs6000-string.c
CommitLineData
8845cb37
AS
1/* Subroutines used to expand string and block move, clear,
2 compare and other operations for PowerPC.
85ec4feb 3 Copyright (C) 1991-2018 Free Software Foundation, Inc.
8845cb37
AS
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
8fcc61f8
RS
21#define IN_TARGET_CODE 1
22
8845cb37
AS
23#include "config.h"
24#include "system.h"
25#include "coretypes.h"
26#include "backend.h"
27#include "rtl.h"
28#include "tree.h"
29#include "memmodel.h"
30#include "tm_p.h"
31#include "ira.h"
32#include "print-tree.h"
33#include "varasm.h"
34#include "explow.h"
35#include "expr.h"
36#include "output.h"
e0bd6c9f 37#include "target.h"
8845cb37
AS
38
/* Expand a block clear operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the length
   operands[3] is the alignment */

int
expand_block_clear (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx bytes_rtx	= operands[1];
  rtx align_rtx = operands[3];
  bool constp	= (GET_CODE (bytes_rtx) == CONST_INT);
  HOST_WIDE_INT align;
  HOST_WIDE_INT bytes;
  int offset;
  int clear_bytes;	/* Width of the store emitted this iteration.  */
  int clear_step;	/* Widest store unit available for this target.  */

  /* If this is not a fixed size move, just call memcpy */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment  */
  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
  /* Convert the alignment to bits for the >= 128/64/32 tests below.  */
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to clear? */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  /* Use the builtin memset after a point, to avoid huge code bloat.
     When optimize_size, avoid any significant code bloat; calling
     memset is about 4 instructions, so allow for one instruction to
     load zero and three to do clearing.  */
  if (TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
    clear_step = 16;
  else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
    clear_step = 8;
  else
    clear_step = 4;

  if (optimize_size && bytes > 3 * clear_step)
    return 0;
  if (! optimize_size && bytes > 8 * clear_step)
    return 0;

  /* Emit a sequence of zero stores, choosing the widest mode allowed
     by the remaining byte count and the alignment each iteration.  */
  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
    {
      machine_mode mode = BLKmode;
      rtx dest;

      if (TARGET_ALTIVEC
	  && ((bytes >= 16 && align >= 128)
	      || (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX)))
	{
	  clear_bytes = 16;
	  mode = V4SImode;
	}
      else if (bytes >= 8 && TARGET_POWERPC64
	       && (align >= 64 || !STRICT_ALIGNMENT))
	{
	  clear_bytes = 8;
	  mode = DImode;
	  if (offset == 0 && align < 64)
	    {
	      rtx addr;

	      /* If the address form is reg+offset with offset not a
		 multiple of four, reload into reg indirect form here
		 rather than waiting for reload.  This way we get one
		 reload, not one per store.  */
	      addr = XEXP (orig_dest, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_dest = replace_equiv_address (orig_dest, addr);
		}
	    }
	}
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
	{			/* move 4 bytes */
	  clear_bytes = 4;
	  mode = SImode;
	}
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
	{			/* move 2 bytes */
	  clear_bytes = 2;
	  mode = HImode;
	}
      else /* move 1 byte at a time */
	{
	  clear_bytes = 1;
	  mode = QImode;
	}

      dest = adjust_address (orig_dest, mode, offset);

      emit_move_insn (dest, CONST0_RTX (mode));
    }

  return 1;
}
146
147/* Figure out the correct instructions to generate to load data for
148 block compare. MODE is used for the read from memory, and
149 data is zero extended if REG is wider than MODE. If LE code
150 is being generated, bswap loads are used.
151
152 REG is the destination register to move the data into.
153 MEM is the memory block being read.
154 MODE is the mode of memory to use for the read. */
155static void
156do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
157{
158 switch (GET_MODE (reg))
159 {
9d36bd3b
AS
160 case E_V16QImode:
161 switch (mode)
162 {
163 case E_V16QImode:
164 if (!BYTES_BIG_ENDIAN)
165 {
166 if (TARGET_P9_VECTOR)
167 emit_insn (gen_vsx_ld_elemrev_v16qi_internal (reg, mem));
168 else
169 {
170 rtx reg_v2di = simplify_gen_subreg (V2DImode, reg,
171 V16QImode, 0);
172 gcc_assert (MEM_P (mem));
173 rtx addr = XEXP (mem, 0);
174 rtx mem_v2di = gen_rtx_MEM (V2DImode, addr);
175 MEM_COPY_ATTRIBUTES (mem_v2di, mem);
176 set_mem_size (mem, GET_MODE_SIZE (V2DImode));
177 emit_insn (gen_vsx_ld_elemrev_v2di (reg_v2di, mem_v2di));
178 }
179 }
180 else
181 emit_insn (gen_vsx_movv2di_64bit (reg, mem));
182 break;
183 default:
184 gcc_unreachable ();
185 }
186 break;
4e10a5a7 187 case E_DImode:
8845cb37
AS
188 switch (mode)
189 {
4e10a5a7 190 case E_QImode:
8845cb37
AS
191 emit_insn (gen_zero_extendqidi2 (reg, mem));
192 break;
4e10a5a7 193 case E_HImode:
8845cb37
AS
194 {
195 rtx src = mem;
196 if (!BYTES_BIG_ENDIAN)
197 {
198 src = gen_reg_rtx (HImode);
199 emit_insn (gen_bswaphi2 (src, mem));
200 }
201 emit_insn (gen_zero_extendhidi2 (reg, src));
202 break;
203 }
4e10a5a7 204 case E_SImode:
8845cb37
AS
205 {
206 rtx src = mem;
207 if (!BYTES_BIG_ENDIAN)
208 {
209 src = gen_reg_rtx (SImode);
210 emit_insn (gen_bswapsi2 (src, mem));
211 }
212 emit_insn (gen_zero_extendsidi2 (reg, src));
213 }
214 break;
4e10a5a7 215 case E_DImode:
8845cb37
AS
216 if (!BYTES_BIG_ENDIAN)
217 emit_insn (gen_bswapdi2 (reg, mem));
218 else
219 emit_insn (gen_movdi (reg, mem));
220 break;
221 default:
222 gcc_unreachable ();
223 }
224 break;
225
4e10a5a7 226 case E_SImode:
8845cb37
AS
227 switch (mode)
228 {
4e10a5a7 229 case E_QImode:
8845cb37
AS
230 emit_insn (gen_zero_extendqisi2 (reg, mem));
231 break;
4e10a5a7 232 case E_HImode:
8845cb37
AS
233 {
234 rtx src = mem;
235 if (!BYTES_BIG_ENDIAN)
236 {
237 src = gen_reg_rtx (HImode);
238 emit_insn (gen_bswaphi2 (src, mem));
239 }
240 emit_insn (gen_zero_extendhisi2 (reg, src));
241 break;
242 }
4e10a5a7 243 case E_SImode:
8845cb37
AS
244 if (!BYTES_BIG_ENDIAN)
245 emit_insn (gen_bswapsi2 (reg, mem));
246 else
247 emit_insn (gen_movsi (reg, mem));
248 break;
4e10a5a7 249 case E_DImode:
8845cb37
AS
250 /* DImode is larger than the destination reg so is not expected. */
251 gcc_unreachable ();
252 break;
253 default:
254 gcc_unreachable ();
255 }
256 break;
9d36bd3b
AS
257
258 case E_QImode:
259 gcc_assert (mode == E_QImode);
260 emit_move_insn (reg, mem);
261 break;
ef4adf1f 262
8845cb37
AS
263 default:
264 gcc_unreachable ();
265 break;
266 }
267}
268
/* Select the mode to be used for reading the next chunk of bytes
   in the compare.

   OFFSET is the current read offset from the beginning of the block.
   BYTES is the number of bytes remaining to be read.
   ALIGN is the minimum alignment of the memory blocks being compared in bytes.  */
static machine_mode
select_block_compare_mode (unsigned HOST_WIDE_INT offset,
			   unsigned HOST_WIDE_INT bytes,
			   unsigned HOST_WIDE_INT align)
{
  /* First see if we can do a whole load unit
     as that will be more efficient than a larger load + shift.  */

  /* If big, use biggest chunk.
     If exactly chunk size, use that size.
     If remainder can be done in one piece with shifting, do that.
     Do largest chunk possible without violating alignment rules.  */

  /* The most we can read without potential page crossing.  */
  unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
		       && word_mode == DImode);

  /* NOTE: the order of these tests matters; exact-size matches are
     preferred over overlapping reads, which are preferred over reads
     that need a trailing shift.  */
  if (word_mode_ok && bytes >= UNITS_PER_WORD)
    return word_mode;
  else if (bytes == GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes == GET_MODE_SIZE (HImode))
    return HImode;
  else if (bytes == GET_MODE_SIZE (QImode))
    return QImode;
  else if (bytes < GET_MODE_SIZE (SImode)
	   && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
	   && offset >= GET_MODE_SIZE (SImode) - bytes)
    /* This matches the case were we have SImode and 3 bytes
       and offset >= 1 and permits us to move back one and overlap
       with the previous read, thus avoiding having to shift
       unwanted bytes off of the input.  */
    return SImode;
  else if (word_mode_ok && bytes < UNITS_PER_WORD
	   && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
	   && offset >= UNITS_PER_WORD-bytes)
    /* Similarly, if we can use DImode it will get matched here and
       can do an overlapping read that ends at the end of the block.  */
    return word_mode;
  else if (word_mode_ok && maxread >= UNITS_PER_WORD)
    /* It is safe to do all remaining in one load of largest size,
       possibly with a shift to get rid of unwanted bytes.  */
    return word_mode;
  else if (maxread >= GET_MODE_SIZE (SImode))
    /* It is safe to do all remaining in one SImode load,
       possibly with a shift to get rid of unwanted bytes.  */
    return SImode;
  else if (bytes > GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes > GET_MODE_SIZE (HImode))
    return HImode;

  /* final fallback is do one byte */
  return QImode;
}
334
335/* Compute the alignment of pointer+OFFSET where the original alignment
336 of pointer was BASE_ALIGN. */
337static unsigned HOST_WIDE_INT
338compute_current_alignment (unsigned HOST_WIDE_INT base_align,
339 unsigned HOST_WIDE_INT offset)
340{
341 if (offset == 0)
342 return base_align;
343 return MIN (base_align, offset & -offset);
344}
345
5ec3397e
AS
346/* Prepare address and then do a load.
347
348 MODE is the mode to use for the load.
349 DEST is the destination register for the data.
350 ADDR is the address to be loaded.
351 ORIG_ADDR is the original address expression. */
352static void
353do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr,
354 rtx orig_addr)
355{
356 rtx mem = gen_rtx_MEM (mode, addr);
357 MEM_COPY_ATTRIBUTES (mem, orig_addr);
358 set_mem_size (mem, GET_MODE_SIZE (mode));
359 do_load_for_compare (dest, mem, mode);
360 return;
361}
362
363/* Do a branch for an if/else decision.
364
365 CMPMODE is the mode to use for the comparison.
366 COMPARISON is the rtx code for the compare needed.
367 A is the first thing to be compared.
368 B is the second thing to be compared.
369 CR is the condition code reg input, or NULL_RTX.
370 TRUE_LABEL is the label to branch to if the condition is true.
371
372 The return value is the CR used for the comparison.
373 If CR is null_rtx, then a new register of CMPMODE is generated.
374 If A and B are both null_rtx, then CR must not be null, and the
375 compare is not generated so you can use this with a dot form insn. */
376
377static void
378do_ifelse (machine_mode cmpmode, rtx_code comparison,
379 rtx a, rtx b, rtx cr, rtx true_label)
380{
381 gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
382 || (a != NULL_RTX && b != NULL_RTX));
383
384 if (cr != NULL_RTX)
385 gcc_assert (GET_MODE (cr) == cmpmode);
386 else
387 cr = gen_reg_rtx (cmpmode);
388
389 rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label);
390
391 if (a != NULL_RTX)
392 emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b));
393
394 rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);
395
396 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
397 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
398 JUMP_LABEL (j) = true_label;
399 LABEL_NUSES (true_label) += 1;
400}
401
402/* Emit an isel of the proper mode for DEST.
403
404 DEST is the isel destination register.
405 SRC1 is the isel source if CR is true.
406 SRC2 is the isel source if CR is false.
407 CR is the condition for the isel. */
408static void
409do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr)
410{
411 if (GET_MODE (dest) == DImode)
412 emit_insn (gen_isel_signed_di (dest, cmp, src_t, src_f, cr));
413 else
414 emit_insn (gen_isel_signed_si (dest, cmp, src_t, src_f, cr));
415}
416
417/* Emit a subtract of the proper mode for DEST.
418
419 DEST is the destination register for the subtract.
420 SRC1 is the first subtract input.
421 SRC2 is the second subtract input.
422
423 Computes DEST = SRC1-SRC2. */
424static void
425do_sub3 (rtx dest, rtx src1, rtx src2)
426{
427 if (GET_MODE (dest) == DImode)
428 emit_insn (gen_subdi3 (dest, src1, src2));
429 else
430 emit_insn (gen_subsi3 (dest, src1, src2));
431}
432
433/* Emit an add of the proper mode for DEST.
434
435 DEST is the destination register for the add.
436 SRC1 is the first add input.
437 SRC2 is the second add input.
438
439 Computes DEST = SRC1+SRC2. */
440static void
441do_add3 (rtx dest, rtx src1, rtx src2)
442{
443 if (GET_MODE (dest) == DImode)
444 emit_insn (gen_adddi3 (dest, src1, src2));
445 else
446 emit_insn (gen_addsi3 (dest, src1, src2));
447}
448
f7e94dfb
AS
449/* Emit an and of the proper mode for DEST.
450
451 DEST is the destination register for the and.
452 SRC1 is the first and input.
453 SRC2 is the second and input.
454
455 Computes DEST = SRC1&SRC2. */
456static void
457do_and3 (rtx dest, rtx src1, rtx src2)
458{
459 if (GET_MODE (dest) == DImode)
460 emit_insn (gen_anddi3 (dest, src1, src2));
461 else
462 emit_insn (gen_andsi3 (dest, src1, src2));
463}
464
465/* Emit an cmpb of the proper mode for DEST.
466
467 DEST is the destination register for the cmpb.
468 SRC1 is the first input.
469 SRC2 is the second input.
470
471 Computes cmpb of SRC1, SRC2. */
472static void
473do_cmpb3 (rtx dest, rtx src1, rtx src2)
474{
475 if (GET_MODE (dest) == DImode)
476 emit_insn (gen_cmpbdi3 (dest, src1, src2));
477 else
478 emit_insn (gen_cmpbsi3 (dest, src1, src2));
479}
480
481/* Emit a rotl of the proper mode for DEST.
482
483 DEST is the destination register for the and.
484 SRC1 is the first and input.
485 SRC2 is the second and input.
486
487 Computes DEST = SRC1 rotated left by SRC2. */
488static void
489do_rotl3 (rtx dest, rtx src1, rtx src2)
490{
491 if (GET_MODE (dest) == DImode)
492 emit_insn (gen_rotldi3 (dest, src1, src2));
493 else
494 emit_insn (gen_rotlsi3 (dest, src1, src2));
495}
496
5ec3397e
AS
/* Generate rtl for a load, shift, and compare of less than a full word.

   LOAD_MODE is the machine mode for the loads.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem, rtx dcond,
		      rtx src1_addr, rtx src2_addr, rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  rtx shift_amount = gen_reg_rtx (word_mode);
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2);
  /* Bytes loaded but not wanted: load_mode_size - cmp_rem.  */
  do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem);

  if (word_mode == DImode)
    {
      /* Convert the byte count to a bit count, then shift the unwanted
	 bytes (past CMP_REM) out of both loaded values.  */
      emit_insn (gen_ashldi3 (shift_amount, shift_amount,
			      GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrdi3 (d1, d1,
			      gen_lowpart (SImode, shift_amount)));
      emit_insn (gen_lshrdi3 (d2, d2,
			      gen_lowpart (SImode, shift_amount)));
    }
  else
    {
      emit_insn (gen_ashlsi3 (shift_amount, shift_amount,
			      GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrsi3 (d1, d1, shift_amount));
      emit_insn (gen_lshrsi3 (d2, d2, shift_amount));
    }

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      /* Pre-P9 path: subtract with carry, converted to the final
	 memcmp-style result by the caller's difference-handling code.  */
      if (word_mode == DImode)
	emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
	emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}
551
/* Generate rtl for an overlapping load and compare of less than a
   full load_mode.  This assumes that the previous word is part of the
   block being compared so it's ok to back up part of a word so we can
   compare the last unaligned full word that ends at the end of the block.

   LOAD_MODE is the machine mode for the loads.
   ISCONST tells whether the remaining length is a constant or in a register.
   BYTES_REM is the remaining length if ISCONST is true.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare if !ISCONST.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_overlap_load_compare (machine_mode load_mode, bool isConst,
			 HOST_WIDE_INT bytes_rem, rtx diff,
			 rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr,
			 rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  /* How far to back up so a full load ends exactly at the block end.
     Only meaningful when ISCONST; otherwise computed at runtime below.  */
  HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem;
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  rtx addr1, addr2;
  if (!isConst || addr_adj)
    {
      rtx adj_reg = gen_reg_rtx (word_mode);
      if (isConst)
	emit_move_insn (adj_reg, GEN_INT (-addr_adj));
      else
	{
	  /* Runtime length: adjustment is cmp_rem - load_mode_size,
	     a negative value backing the addresses up.  */
	  rtx reg_lms = gen_reg_rtx (word_mode);
	  emit_move_insn (reg_lms, GEN_INT (load_mode_size));
	  do_sub3 (adj_reg, cmp_rem, reg_lms);
	}

      addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg);
      addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg);
    }
  else
    {
      /* Remainder is exactly one full load; no backing up needed.  */
      addr1 = src1_addr;
      addr2 = src2_addr;
    }

  do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      /* Pre-P9 path: subtract with carry, converted to the final
	 memcmp-style result by the caller's difference-handling code.  */
      if (word_mode == DImode)
	emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
	emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}
617
618/* Expand a block compare operation using loop code, and return true
619 if successful. Return false if we should let the compiler generate
620 normal code, probably a memcmp call.
621
622 OPERANDS[0] is the target (result).
623 OPERANDS[1] is the first source.
624 OPERANDS[2] is the second source.
625 OPERANDS[3] is the length.
626 OPERANDS[4] is the alignment. */
627bool
628expand_compare_loop (rtx operands[])
629{
630 rtx target = operands[0];
631 rtx orig_src1 = operands[1];
632 rtx orig_src2 = operands[2];
633 rtx bytes_rtx = operands[3];
634 rtx align_rtx = operands[4];
635
636 /* This case is complicated to handle because the subtract
637 with carry instructions do not generate the 64-bit
638 carry and so we must emit code to calculate it ourselves.
639 We choose not to implement this yet. */
640 if (TARGET_32BIT && TARGET_POWERPC64)
641 return false;
642
643 /* Allow non-const length. */
644 int bytes_is_const = CONST_INT_P (bytes_rtx);
645
646 /* This must be a fixed size alignment. */
647 if (!CONST_INT_P (align_rtx))
648 return false;
649
650 HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
651 HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
652 HOST_WIDE_INT minalign = MIN (align1, align2);
653
654 bool isP7 = (rs6000_tune == PROCESSOR_POWER7);
655
656 gcc_assert (GET_MODE (target) == SImode);
657
658 /* Anything to move? */
659 HOST_WIDE_INT bytes = 0;
660 if (bytes_is_const)
661 bytes = INTVAL (bytes_rtx);
662
663 if (bytes_is_const && bytes == 0)
664 return true;
665
666 /* Limit the amount we compare, if known statically. */
667 HOST_WIDE_INT max_bytes;
668 switch (rs6000_tune)
669 {
670 case PROCESSOR_POWER7:
671 if (!bytes_is_const)
672 if (minalign < 8)
673 max_bytes = 0;
674 else
675 max_bytes = 128;
676 else
677 if (minalign < 8)
678 max_bytes = 32;
679 else
680 max_bytes = 128;
681 break;
682 case PROCESSOR_POWER8:
683 if (!bytes_is_const)
684 max_bytes = 0;
685 else
686 if (minalign < 8)
687 max_bytes = 128;
688 else
689 max_bytes = 64;
690 break;
691 case PROCESSOR_POWER9:
692 if (bytes_is_const)
693 max_bytes = 191;
694 else
695 max_bytes = 0;
696 break;
697 default:
698 max_bytes = 128;
699 }
700
701 /* Allow the option to override the default. */
702 if (rs6000_block_compare_inline_loop_limit >= 0)
703 max_bytes = (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_loop_limit;
704
705 if (max_bytes == 0)
706 return false;
707
708 rtx cmp_rem = gen_reg_rtx (word_mode); /* Remainder for library call. */
709 rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop. */
710 HOST_WIDE_INT niter;
711 rtx iter = gen_reg_rtx (word_mode);
712 rtx iv1 = gen_reg_rtx (word_mode);
713 rtx iv2 = gen_reg_rtx (word_mode);
714 rtx d1_1 = gen_reg_rtx (word_mode); /* Addr expression src1+iv1 */
715 rtx d1_2 = gen_reg_rtx (word_mode); /* Addr expression src1+iv2 */
716 rtx d2_1 = gen_reg_rtx (word_mode); /* Addr expression src2+iv1 */
717 rtx d2_2 = gen_reg_rtx (word_mode); /* Addr expression src2+iv2 */
718
719 /* Strip unneeded subreg from length if there is one. */
720 if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx))
721 bytes_rtx = SUBREG_REG (bytes_rtx);
722 /* Extend bytes_rtx to word_mode if needed. But, we expect only to
723 maybe have to deal with the case were bytes_rtx is SImode and
724 word_mode is DImode. */
725 if (!bytes_is_const)
726 {
727 if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
728 /* Do not expect length longer than word_mode. */
ef4adf1f 729 return false;
5ec3397e
AS
730 else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode))
731 {
732 bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
733 bytes_rtx = force_reg (word_mode,
734 gen_rtx_fmt_e (ZERO_EXTEND, word_mode,
735 bytes_rtx));
736 }
737 else
738 /* Make sure it's in a register before we get started. */
739 bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
740 }
741
742 machine_mode load_mode = word_mode;
743 HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
744
745 /* Number of bytes per iteration of the unrolled loop. */
746 HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
747 /* max iters and bytes compared in the loop. */
748 HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes;
749 HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes;
750 int l2lb = floor_log2 (loop_bytes);
751
752 if (bytes_is_const && (max_bytes < load_mode_size
753 || !IN_RANGE (bytes, load_mode_size, max_bytes)))
754 return false;
755
756 bool no_remainder_code = false;
757 rtx final_label = gen_label_rtx ();
758 rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
759 rtx diff_label = gen_label_rtx ();
760 rtx library_call_label = NULL;
761 rtx cleanup_label = gen_label_rtx ();
762
763 rtx cr;
764
765 rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
766 rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));
767
768 /* Difference found is stored here before jump to diff_label. */
769 rtx diff = gen_reg_rtx (word_mode);
770 rtx j;
771
772 /* Example of generated code for 35 bytes aligned 1 byte.
ef4adf1f 773
5ec3397e
AS
774 mtctr 8
775 li 6,0
776 li 5,8
777 .L13:
778 ldbrx 7,3,6
779 ldbrx 9,10,6
780 ldbrx 0,3,5
781 ldbrx 4,10,5
782 addi 6,6,16
783 addi 5,5,16
784 subfc. 9,9,7
785 bne 0,.L10
786 subfc. 9,4,0
787 bdnzt 2,.L13
788 bne 0,.L10
789 add 3,3,6
790 add 10,10,6
791 addi 9,3,-5
792 ldbrx 7,0,9
793 addi 9,10,-5
794 ldbrx 9,0,9
795 subfc 9,9,7
796 .p2align 4,,15
797 .L10:
798 popcntd 9,9
799 subfe 10,10,10
800 or 9,9,10
ef4adf1f 801
5ec3397e
AS
802 Compiled with -fno-reorder-blocks for clarity. */
803
804 /* Structure of what we're going to do:
805 Two separate lengths: what we will compare before bailing to library
806 call (max_bytes), and the total length to be checked.
807 if length <= 16, branch to linear cleanup code starting with
808 remainder length check (length not known at compile time)
809 set up 2 iv's and load count reg, compute remainder length
810 unrollx2 compare loop
811 if loop exit due to a difference, branch to difference handling code
812 if remainder length < 8, branch to final cleanup compare
813 load and compare 8B
814 final cleanup comparison (depends on alignment and length)
815 load 8B, shift off bytes past length, compare
816 load 8B ending at last byte and compare
817 load/compare 1 byte at a time (short block abutting 4k boundary)
818 difference handling, 64->32 conversion
819 final result
820 branch around memcmp call
821 memcmp library call
822 */
823
824 /* If bytes is not const, compare length and branch directly
825 to the cleanup code that can handle 0-16 bytes if length
826 is >= 16. Stash away bytes-max_bytes for the library call. */
827 if (bytes_is_const)
828 {
829 /* These need to be set for some of the places we may jump to. */
830 if (bytes > max_bytes)
831 {
832 no_remainder_code = true;
833 niter = max_loop_iter;
834 library_call_label = gen_label_rtx ();
835 }
836 else
837 {
838 niter = bytes / loop_bytes;
839 }
840 emit_move_insn (iter, GEN_INT (niter));
841 emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes));
842 emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes));
843 }
844 else
845 {
846 library_call_label = gen_label_rtx ();
847
848 /* If we go to the cleanup code, it expects length to be in cmp_rem. */
849 emit_move_insn (cmp_rem, bytes_rtx);
850
851 /* Check for > max_bytes bytes. We want to bail out as quickly as
852 possible if we have to go over to memcmp. */
853 do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
854 NULL_RTX, library_call_label);
855
856 /* Check for < loop_bytes bytes. */
857 do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes),
858 NULL_RTX, cleanup_label);
859
860 /* Loop compare bytes and iterations if bytes>max_bytes. */
861 rtx mb_reg = gen_reg_rtx (word_mode);
862 emit_move_insn (mb_reg, GEN_INT (max_loop_bytes));
863 rtx mi_reg = gen_reg_rtx (word_mode);
864 emit_move_insn (mi_reg, GEN_INT (max_loop_iter));
865
866 /* Compute number of loop iterations if bytes <= max_bytes. */
867 if (word_mode == DImode)
868 emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb)));
869 else
870 emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb)));
871
872 /* Compute bytes to compare in loop if bytes <= max_bytes. */
873 rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb);
874 if (word_mode == DImode)
875 {
876 emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask));
877 }
878 else
879 {
880 emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask));
881 }
882
883 /* Check for bytes <= max_bytes. */
884 if (TARGET_ISEL)
885 {
886 /* P9 has fast isel so we use one compare and two isel. */
887 cr = gen_reg_rtx (CCmode);
888 rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx,
889 GEN_INT (max_bytes));
890 emit_move_insn (cr, compare_rtx);
891 rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx);
892 do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr);
893 do_isel (iter, cmp_rtx, iter, mi_reg, cr);
894 }
895 else
896 {
897 rtx lab_after = gen_label_rtx ();
898 do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes),
899 NULL_RTX, lab_after);
900 emit_move_insn (loop_cmp, mb_reg);
901 emit_move_insn (iter, mi_reg);
902 emit_label (lab_after);
903 }
904
905 /* Now compute remainder bytes which isn't used until after the loop. */
906 do_sub3 (cmp_rem, bytes_rtx, loop_cmp);
907 }
908
909 rtx dcond = NULL_RTX; /* Used for when we jump to diff_label. */
910 /* For p9 we need to have just one of these as multiple places define
911 it and it gets used by the setb at the end. */
912 if (TARGET_P9_MISC)
913 dcond = gen_reg_rtx (CCUNSmode);
914
915 if (!bytes_is_const || bytes >= loop_bytes)
916 {
917 /* It should not be possible to come here if remaining bytes is
918 < 16 in the runtime case either. Compute number of loop
919 iterations. We compare 2*word_mode per iteration so 16B for
920 64-bit code and 8B for 32-bit. Set up two induction
921 variables and load count register. */
922
923 /* HACK ALERT: create hard reg for CTR here. If we just use a
924 pseudo, cse will get rid of it and then the allocator will
925 see it used in the lshr above and won't give us ctr. */
926 rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
927 emit_move_insn (ctr, iter);
928 emit_move_insn (diff, GEN_INT (0));
929 emit_move_insn (iv1, GEN_INT (0));
930 emit_move_insn (iv2, GEN_INT (load_mode_size));
931
932 /* inner loop to compare 2*word_mode */
933 rtx loop_top_label = gen_label_rtx ();
934 emit_label (loop_top_label);
935
936 rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1);
937 rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1);
938
939 do_load_for_compare_from_addr (load_mode, d1_1,
940 src1_ix1, orig_src1);
941 do_load_for_compare_from_addr (load_mode, d2_1,
942 src2_ix1, orig_src2);
943 do_add3 (iv1, iv1, GEN_INT (loop_bytes));
944
945 rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2);
946 rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2);
947
948 do_load_for_compare_from_addr (load_mode, d1_2,
949 src1_ix2, orig_src1);
950 do_load_for_compare_from_addr (load_mode, d2_2,
951 src2_ix2, orig_src2);
952 do_add3 (iv2, iv2, GEN_INT (loop_bytes));
953
954 if (TARGET_P9_MISC)
955 {
956 /* Generate a compare, and convert with a setb later. */
957 rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
958 emit_insn (gen_rtx_SET (dcond, cmp));
959 }
960 else
961 {
962 dcond = gen_reg_rtx (CCmode);
963 if (word_mode == DImode)
964 emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
965 else
966 emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
967 }
968
969 do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
970 dcond, diff_label);
971
972 if (TARGET_P9_MISC)
973 {
974 /* Generate a compare, and convert with a setb later. */
975 rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2);
976 emit_insn (gen_rtx_SET (dcond, cmp));
977 }
978 else
979 {
980 dcond = gen_reg_rtx (CCmode);
981 if (word_mode == DImode)
982 emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond));
983 else
984 emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond));
985 }
986
987 rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2);
988 if (TARGET_64BIT)
989 j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr,
990 eqrtx, dcond));
991 else
992 j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
993 eqrtx, dcond));
994 JUMP_LABEL (j) = loop_top_label;
995 LABEL_NUSES (loop_top_label) += 1;
996 }
997
998 HOST_WIDE_INT bytes_remaining = 0;
999 if (bytes_is_const)
1000 bytes_remaining = (bytes % loop_bytes);
1001
1002 /* If diff is nonzero, branch to difference handling
1003 code. If we exit here with a nonzero diff, it is
1004 because the second word differed. */
1005 if (TARGET_P9_MISC)
1006 do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond, diff_label);
1007 else
1008 do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX, diff_label);
1009
1010 if (library_call_label != NULL && bytes_is_const && bytes > max_bytes)
1011 {
1012 /* If the length is known at compile time, then we will always
1013 have a remainder to go to the library call with. */
1014 rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, library_call_label);
1015 j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref));
1016 JUMP_LABEL (j) = library_call_label;
1017 LABEL_NUSES (library_call_label) += 1;
1018 emit_barrier ();
1019 }
1020
1021 if (bytes_is_const && bytes_remaining == 0)
1022 {
1023 /* No remainder and if we are here then diff is 0 so just return 0 */
1024 if (TARGET_64BIT)
1025 emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
1026 else
1027 emit_move_insn (target, diff);
1028 j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
1029 JUMP_LABEL (j) = final_label;
1030 LABEL_NUSES (final_label) += 1;
1031 emit_barrier ();
1032 }
1033 else if (!no_remainder_code)
1034 {
1035 /* Update addresses to point to the next word to examine. */
1036 do_add3 (src1_addr, src1_addr, iv1);
1037 do_add3 (src2_addr, src2_addr, iv1);
1038
1039 emit_label (cleanup_label);
1040
1041 if (!bytes_is_const)
1042 {
1043 /* If we're dealing with runtime length, we have to check if
ef4adf1f 1044 it's zero after the loop. When length is known at compile
5ec3397e
AS
1045 time the no-remainder condition is dealt with above. By
1046 doing this after cleanup_label, we also deal with the
1047 case where length is 0 at the start and we bypass the
1048 loop with a branch to cleanup_label. */
1049 emit_move_insn (target, const0_rtx);
1050 do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
1051 NULL_RTX, final_label);
1052 }
1053
1054 rtx final_cleanup = gen_label_rtx ();
1055 rtx cmp_rem_before = gen_reg_rtx (word_mode);
1056 /* Compare one more word_mode chunk if needed. */
37ca383f 1057 if (!bytes_is_const || bytes_remaining >= load_mode_size)
5ec3397e
AS
1058 {
1059 /* If remainder length < word length, branch to final
1060 cleanup compare. */
1061 if (!bytes_is_const)
1062 do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size),
1063 NULL_RTX, final_cleanup);
1064
1065 /* load and compare 8B */
1066 do_load_for_compare_from_addr (load_mode, d1_1,
1067 src1_addr, orig_src1);
1068 do_load_for_compare_from_addr (load_mode, d2_1,
1069 src2_addr, orig_src2);
1070
1071 /* Compare the word, see if we need to do the last partial. */
1072 if (TARGET_P9_MISC)
1073 {
1074 /* Generate a compare, and convert with a setb later. */
1075 rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
1076 emit_insn (gen_rtx_SET (dcond, cmp));
1077 }
1078 else
1079 {
1080 dcond = gen_reg_rtx (CCmode);
1081 if (word_mode == DImode)
1082 emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
1083 else
1084 emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
1085 }
1086
1087 do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
1088 dcond, diff_label);
1089
1090 do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size));
1091 do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size));
1092 emit_move_insn (cmp_rem_before, cmp_rem);
1093 do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size));
1094 if (bytes_is_const)
1095 bytes_remaining -= load_mode_size;
1096 else
1097 /* See if remaining length is now zero. We previously set
1098 target to 0 so we can just jump to the end. */
1099 do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
1100 NULL_RTX, final_label);
1101
1102 }
1103
1104 /* Cases:
1105 bytes_is_const
1106 We can always shift back to do an overlapping compare
1107 of the last chunk because we know length >= 8.
1108
1109 !bytes_is_const
1110 align>=load_mode_size
1111 Read word_mode and mask
1112 align<load_mode_size
1113 avoid stepping past end
1114
1115 Three strategies:
1116 * decrement address and do overlapping compare
1117 * read word_mode and mask
1118 * carefully avoid crossing 4k boundary
1119 */
1120
1121 if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7))
1122 && align1 >= load_mode_size && align2 >= load_mode_size)
1123 {
1124 /* Alignment is larger than word_mode so we do not need to be
1125 concerned with extra page crossings. But, we do not know
1126 that the length is larger than load_mode_size so we might
1127 end up compareing against data before the block if we try
1128 an overlapping compare. Also we use this on P7 for fixed length
1129 remainder because P7 doesn't like overlapping unaligned.
1130 Strategy: load 8B, shift off bytes past length, and compare. */
1131 emit_label (final_cleanup);
1132 do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
1133 src1_addr, src2_addr, orig_src1, orig_src2);
1134 }
1135 else if (bytes_remaining && bytes_is_const)
1136 {
1137 /* We do not do loop expand if length < 32 so we know at the
1138 end we can do an overlapping compare.
1139 Strategy: shift address back and do word_mode load that
1140 ends at the end of the block. */
1141 emit_label (final_cleanup);
1142 do_overlap_load_compare (load_mode, true, bytes_remaining, diff,
1143 cmp_rem, dcond, src1_addr, src2_addr,
1144 orig_src1, orig_src2);
1145 }
1146 else if (!bytes_is_const)
1147 {
1148 rtx handle4k_label = gen_label_rtx ();
1149 rtx nonconst_overlap = gen_label_rtx ();
1150 emit_label (nonconst_overlap);
1151
1152 /* Here we have to handle the case where whe have runtime
1153 length which may be too short for overlap compare, and
1154 alignment is not at least load_mode_size so we have to
1155 tread carefully to avoid stepping across 4k boundaries. */
1156
1157 /* If the length after the loop was larger than word_mode
1158 size, we can just do an overlapping compare and we're
1159 done. We fall through to this code from the word_mode
1160 compare that preceeds this. */
1161 do_overlap_load_compare (load_mode, false, 0, diff,
1162 cmp_rem, dcond, src1_addr, src2_addr,
1163 orig_src1, orig_src2);
1164
1165 rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label);
1166 j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
1167 JUMP_LABEL (j) = diff_label;
1168 LABEL_NUSES (diff_label) += 1;
1169 emit_barrier ();
1170
1171 /* If we couldn't do the overlap compare we have to be more
1172 careful of the 4k boundary. Test to see if either
1173 address is less than word_mode_size away from a 4k
1174 boundary. If not, then we can do a load/shift/compare
1175 and we are done. We come to this code if length was less
1176 than word_mode_size. */
1177
1178 emit_label (final_cleanup);
1179
1180 /* We can still avoid the slow case if the length was larger
1181 than one loop iteration, in which case go do the overlap
1182 load compare path. */
1183 do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes),
1184 NULL_RTX, nonconst_overlap);
1185
1186 rtx rem4k = gen_reg_rtx (word_mode);
1187 rtx dist1 = gen_reg_rtx (word_mode);
1188 rtx dist2 = gen_reg_rtx (word_mode);
1189 do_sub3 (rem4k, GEN_INT (4096), cmp_rem);
1190 if (word_mode == SImode)
1191 emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff)));
1192 else
1193 emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff)));
1194 do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX, handle4k_label);
1195 if (word_mode == SImode)
1196 emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff)));
1197 else
1198 emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff)));
1199 do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX, handle4k_label);
1200
1201 /* We don't have a 4k boundary to deal with, so do
1202 a load/shift/compare and jump to diff. */
1203
1204 do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
1205 src1_addr, src2_addr, orig_src1, orig_src2);
1206
1207 j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
1208 JUMP_LABEL (j) = diff_label;
1209 LABEL_NUSES (diff_label) += 1;
1210 emit_barrier ();
1211
1212 /* Finally in the unlikely case we are inching up to a
1213 4k boundary we use a compact lbzx/compare loop to do
1214 it a byte at a time. */
1215
1216 emit_label (handle4k_label);
1217
1218 rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
1219 emit_move_insn (ctr, cmp_rem);
1220 rtx ixreg = gen_reg_rtx (Pmode);
1221 emit_move_insn (ixreg, const0_rtx);
1222
1223 rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg);
1224 rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg);
1225 rtx d1 = gen_reg_rtx (word_mode);
1226 rtx d2 = gen_reg_rtx (word_mode);
1227
1228 rtx fc_loop = gen_label_rtx ();
1229 emit_label (fc_loop);
1230
1231 do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1);
1232 do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2);
1233
1234 do_add3 (ixreg, ixreg, const1_rtx);
1235
1236 rtx cond = gen_reg_rtx (CCmode);
1237 rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2);
1238 rs6000_emit_dot_insn (diff, subexpr, 2, cond);
1239
1240 rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2);
1241 if (TARGET_64BIT)
1242 j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr,
1243 eqrtx, cond));
1244 else
1245 j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr,
1246 eqrtx, cond));
1247 JUMP_LABEL (j) = fc_loop;
1248 LABEL_NUSES (fc_loop) += 1;
1249
1250 if (TARGET_64BIT)
1251 emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
1252 else
1253 emit_move_insn (target, diff);
1254
1255 /* Since we are comparing bytes, the difference can be used
1256 as the final result and we are done here. */
1257 j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
1258 JUMP_LABEL (j) = final_label;
1259 LABEL_NUSES (final_label) += 1;
1260 emit_barrier ();
1261 }
1262 }
1263
1264 emit_label (diff_label);
1265 /* difference handling, 64->32 conversion */
1266
1267 /* We need to produce DI result from sub, then convert to target SI
1268 while maintaining <0 / ==0 / >0 properties. This sequence works:
1269 subfc L,A,B
1270 subfe H,H,H
1271 popcntd L,L
1272 rldimi L,H,6,0
1273
1274 This is an alternate one Segher cooked up if somebody
1275 wants to expand this for something that doesn't have popcntd:
1276 subfc L,a,b
1277 subfe H,x,x
1278 addic t,L,-1
1279 subfe v,t,L
1280 or z,v,H
1281
1282 And finally, p9 can just do this:
1283 cmpld A,B
1284 setb r */
1285
1286 if (TARGET_P9_MISC)
1287 emit_insn (gen_setb_unsigned (target, dcond));
1288 else
1289 {
1290 if (TARGET_64BIT)
1291 {
1292 rtx tmp_reg_ca = gen_reg_rtx (DImode);
1293 emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
1294 emit_insn (gen_popcntddi2 (diff, diff));
1295 emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca));
1296 emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
1297 }
1298 else
1299 {
1300 rtx tmp_reg_ca = gen_reg_rtx (SImode);
1301 emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
1302 emit_insn (gen_popcntdsi2 (diff, diff));
1303 emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca));
1304 }
1305 }
1306
1307 if (library_call_label != NULL)
1308 {
1309 /* Branch around memcmp call. */
1310 j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
1311 JUMP_LABEL (j) = final_label;
1312 LABEL_NUSES (final_label) += 1;
1313 emit_barrier ();
1314
1315 /* Make memcmp library call. cmp_rem is the remaining bytes that
1316 were compared and cmp_rem is the expected amount to be compared
1317 by memcmp. If we don't find a difference in the loop compare, do
1318 the library call directly instead of doing a small compare just
1319 to get to an arbitrary boundary before calling it anyway.
1320 Also, update addresses to point to the next word to examine. */
1321 emit_label (library_call_label);
1322
1323 rtx len_rtx = gen_reg_rtx (word_mode);
1324 if (bytes_is_const)
1325 {
1326 emit_move_insn (len_rtx, cmp_rem);
1327 do_add3 (src1_addr, src1_addr, iv1);
1328 do_add3 (src2_addr, src2_addr, iv1);
1329 }
1330 else
1331 emit_move_insn (len_rtx, bytes_rtx);
1332
1333 tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP);
1334 emit_library_call_value (XEXP (DECL_RTL (fun), 0),
1335 target, LCT_NORMAL, GET_MODE (target),
1336 src1_addr, Pmode,
1337 src2_addr, Pmode,
1338 len_rtx, GET_MODE (len_rtx));
1339 }
1340
1341 /* emit final_label */
1342 emit_label (final_label);
1343 return true;
1344}
1345
8845cb37
AS
/* Expand a block compare operation, and return true if successful.
   Return false if we should let the compiler generate normal code,
   probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_block_compare (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];
  HOST_WIDE_INT cmp_bytes = 0;
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);

  /* Allow this param to shut off all expansion.  */
  if (rs6000_block_compare_inline_limit == 0)
    return false;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.
     However slow_unaligned_access returns true on P7 even though the
     performance of this code is good there.  */
  if (!isP7
      && (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
	  || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2))))
    return false;

  /* Unaligned l*brx traps on P7 so don't do this.  However this should
     not affect much because LE isn't really supported on P7 anyway.  */
  if (isP7 && !BYTES_BIG_ENDIAN)
    return false;

  /* If this is not a fixed size compare, try generating loop code and
     if that fails just call memcmp.  */
  if (!CONST_INT_P (bytes_rtx))
    return expand_compare_loop (operands);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  /* Alignment of the sources in bytes (the smaller of the two).  */
  unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to move?  */
  unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
  if (bytes == 0)
    return true;
  /* NOTE(review): for bytes == 0 no insn is emitted to set TARGET.
     Presumably the middle end folds constant-zero-length memcmp before
     reaching this expander -- confirm, since otherwise the result
     register would be left uninitialized.  */

  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
  /* P7/P8 code uses cond for subfc. but P9 uses
     it for cmpld which needs CCUNSmode.  */
  rtx cond;
  if (TARGET_P9_MISC)
    cond = gen_reg_rtx (CCUNSmode);
  else
    cond = gen_reg_rtx (CCmode);

  /* Strategy phase.  How many ops will this take and should we expand it?  */

  unsigned HOST_WIDE_INT offset = 0;
  machine_mode load_mode =
    select_block_compare_mode (offset, bytes, base_align);
  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);

  /* We don't want to generate too much code.  The loop code can take
     over for lengths greater than 31 bytes.  */
  unsigned HOST_WIDE_INT max_bytes = rs6000_block_compare_inline_limit;
  if (!IN_RANGE (bytes, 1, max_bytes))
    return expand_compare_loop (operands);

  /* The code generated for p7 and older is not faster than glibc
     memcmp if alignment is small and length is not short, so bail
     out to avoid those conditions.  */
  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
      && ((base_align == 1 && bytes > 16)
	  || (base_align == 2 && bytes > 32)))
    return false;

  /* True once any emitted block needed the 64->32 conversion tail.  */
  bool generate_6432_conversion = false;
  rtx convert_label = NULL;
  rtx final_label = NULL;

  /* Example of generated code for 18 bytes aligned 1 byte.
     Compiled with -fno-reorder-blocks for clarity.
             ldbrx 10,31,8
             ldbrx 9,7,8
             subfc. 9,9,10
             bne 0,.L6487
             addi 9,12,8
             addi 5,11,8
             ldbrx 10,0,9
             ldbrx 9,0,5
             subfc. 9,9,10
             bne 0,.L6487
             addi 9,12,16
             lhbrx 10,0,9
             addi 9,11,16
             lhbrx 9,0,9
             subf 9,9,10
             b .L6488
             .p2align 4,,15
     .L6487: #convert_label
             popcntd 9,9
             subfe 10,10,10
             or 9,9,10
     .L6488: #final_label
             extsw 10,9

     We start off with DImode for two blocks that jump to the DI->SI conversion
     if the difference is found there, then a final block of HImode that skips
     the DI->SI conversion.  */

  /* Emit one compare block per chunk, consuming BYTES from OFFSET upward.
     The chunk size (CMP_BYTES) is chosen per iteration from the remaining
     length and the alignment at the current offset.  */
  while (bytes > 0)
    {
      unsigned int align = compute_current_alignment (base_align, offset);
      load_mode = select_block_compare_mode (offset, bytes, align);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes >= load_mode_size)
	cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	{
	  /* Move this load back so it doesn't go past the end.
	     P8/P9 can do this efficiently.  */
	  unsigned int extra_bytes = load_mode_size - bytes;
	  cmp_bytes = bytes;
	  if (extra_bytes < offset)
	    {
	      offset -= extra_bytes;
	      cmp_bytes = load_mode_size;
	      bytes = cmp_bytes;
	    }
	}
      else
	/* P7 and earlier can't do the overlapping load trick fast,
	   so this forces a non-overlapping load and a shift to get
	   rid of the extra bytes.  */
	cmp_bytes = bytes;

      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      /* Force the addresses into registers; the byte-reversed load
	 patterns used below take a plain register address.  */
      if (!REG_P (XEXP (src1, 0)))
	{
	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
	  src1 = replace_equiv_address (src1, src1_reg);
	}
      set_mem_size (src1, load_mode_size);

      if (!REG_P (XEXP (src2, 0)))
	{
	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
	  src2 = replace_equiv_address (src2, src2_reg);
	}
      set_mem_size (src2, load_mode_size);

      do_load_for_compare (tmp_reg_src1, src1, load_mode);
      do_load_for_compare (tmp_reg_src2, src2, load_mode);

      if (cmp_bytes < load_mode_size)
	{
	  /* Shift unneeded bytes off.  */
	  rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
	  if (word_mode == DImode)
	    {
	      emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	  else
	    {
	      emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	}

      /* Bytes still to compare after this chunk; zero on the last block.  */
      int remain = bytes - cmp_bytes;
      if (GET_MODE_SIZE (GET_MODE (target)) > GET_MODE_SIZE (load_mode))
	{
	  /* Target is larger than load size so we don't need to
	     reduce result size.  */

	  /* We previously did a block that need 64->32 conversion but
	     the current block does not, so a label is needed to jump
	     to the end.  */
	  if (generate_6432_conversion && !final_label)
	    final_label = gen_label_rtx ();

	  if (remain > 0)
	    {
	      /* This is not the last block, branch to the end if the result
		 of this subtract is not zero.  */
	      if (!final_label)
		final_label = gen_label_rtx ();
	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
	      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
	      rtx cr = gen_reg_rtx (CCmode);
	      rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
	      emit_insn (gen_movsi (target,
				    gen_lowpart (SImode, tmp_reg_src2)));
	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
						 fin_ref, pc_rtx);
	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      JUMP_LABEL (j) = final_label;
	      LABEL_NUSES (final_label) += 1;
	    }
	  else
	    {
	      /* Last block and the result fits: a plain subtract gives
		 the final <0 / ==0 / >0 value directly.  */
	      if (word_mode == DImode)
		{
		  emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
					 tmp_reg_src2));
		  emit_insn (gen_movsi (target,
					gen_lowpart (SImode, tmp_reg_src2)));
		}
	      else
		emit_insn (gen_subsi3 (target, tmp_reg_src1, tmp_reg_src2));

	      if (final_label)
		{
		  rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
		  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
		  JUMP_LABEL (j) = final_label;
		  LABEL_NUSES (final_label) += 1;
		  emit_barrier ();
		}
	    }
	}
      else
	{
	  /* Do we need a 64->32 conversion block? We need the 64->32
	     conversion even if target size == load_mode size because
	     the subtract generates one extra bit.  */
	  generate_6432_conversion = true;

	  if (remain > 0)
	    {
	      if (!convert_label)
		convert_label = gen_label_rtx ();

	      /* Compare to zero and branch to convert_label if not zero.  */
	      rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
	      if (TARGET_P9_MISC)
		{
		  /* Generate a compare, and convert with a setb later.  */
		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
					     tmp_reg_src2);
		  emit_insn (gen_rtx_SET (cond, cmp));
		}
	      else
		/* Generate a subfc. and use the longer
		   sequence for conversion.  */
		if (TARGET_64BIT)
		  emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
						     tmp_reg_src1, cond));
		else
		  emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
						     tmp_reg_src1, cond));
	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
						 cvt_ref, pc_rtx);
	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      JUMP_LABEL (j) = convert_label;
	      LABEL_NUSES (convert_label) += 1;
	    }
	  else
	    {
	      /* Just do the subtract/compare.  Since this is the last block
		 the convert code will be generated immediately following.  */
	      if (TARGET_P9_MISC)
		{
		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
					     tmp_reg_src2);
		  emit_insn (gen_rtx_SET (cond, cmp));
		}
	      else
		if (TARGET_64BIT)
		  emit_insn (gen_subfdi3_carry (tmp_reg_src2, tmp_reg_src2,
						tmp_reg_src1));
		else
		  emit_insn (gen_subfsi3_carry (tmp_reg_src2, tmp_reg_src2,
						tmp_reg_src1));
	    }
	}

      offset += cmp_bytes;
      bytes -= cmp_bytes;
    }

  if (generate_6432_conversion)
    {
      if (convert_label)
	emit_label (convert_label);

      /* We need to produce DI result from sub, then convert to target SI
	 while maintaining <0 / ==0 / >0 properties.  This sequence works:
	 subfc L,A,B
	 subfe H,H,H
	 popcntd L,L
	 rldimi L,H,6,0

	 This is an alternate one Segher cooked up if somebody
	 wants to expand this for something that doesn't have popcntd:
	 subfc L,a,b
	 subfe H,x,x
	 addic t,L,-1
	 subfe v,t,L
	 or z,v,H

	 And finally, p9 can just do this:
	 cmpld A,B
	 setb r */

      if (TARGET_P9_MISC)
	{
	  emit_insn (gen_setb_unsigned (target, cond));
	}
      else
	{
	  if (TARGET_64BIT)
	    {
	      rtx tmp_reg_ca = gen_reg_rtx (DImode);
	      emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
	      emit_insn (gen_popcntddi2 (tmp_reg_src2, tmp_reg_src2));
	      emit_insn (gen_iordi3 (tmp_reg_src2, tmp_reg_src2, tmp_reg_ca));
	      emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2)));
	    }
	  else
	    {
	      rtx tmp_reg_ca = gen_reg_rtx (SImode);
	      emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
	      emit_insn (gen_popcntdsi2 (tmp_reg_src2, tmp_reg_src2));
	      emit_insn (gen_iorsi3 (target, tmp_reg_src2, tmp_reg_ca));
	    }
	}
    }

  if (final_label)
    emit_label (final_label);

  gcc_assert (bytes == 0);
  return true;
}
1706
f7e94dfb 1707/* Generate page crossing check and branch code to set up for
8845cb37
AS
1708 strncmp when we don't have DI alignment.
1709 STRNCMP_LABEL is the label to branch if there is a page crossing.
f7e94dfb 1710 SRC_ADDR is the string address to be examined.
8845cb37
AS
1711 BYTES is the max number of bytes to compare. */
1712static void
f7e94dfb 1713expand_strncmp_align_check (rtx strncmp_label, rtx src_addr, HOST_WIDE_INT bytes)
8845cb37
AS
1714{
1715 rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label);
f7e94dfb
AS
1716 rtx src_pgoff = gen_reg_rtx (GET_MODE (src_addr));
1717 do_and3 (src_pgoff, src_addr, GEN_INT (0xfff));
8845cb37 1718 rtx cond = gen_reg_rtx (CCmode);
f7e94dfb 1719 emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_pgoff,
8845cb37
AS
1720 GEN_INT (4096 - bytes)));
1721
0c791c59 1722 rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx);
8845cb37
AS
1723
1724 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
0c791c59 1725 lab_ref, pc_rtx);
8845cb37
AS
1726 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
1727 JUMP_LABEL (j) = strncmp_label;
1728 LABEL_NUSES (strncmp_label) += 1;
1729}
1730
74f9986e
AS
1731/* Generate the sequence of compares for strcmp/strncmp using gpr instructions.
1732 BYTES_TO_COMPARE is the number of bytes to be compared.
1733 BASE_ALIGN is the smaller of the alignment of the two strings.
1734 ORIG_SRC1 is the unmodified rtx for the first string.
1735 ORIG_SRC2 is the unmodified rtx for the second string.
1736 TMP_REG_SRC1 is the register for loading the first string.
1737 TMP_REG_SRC2 is the register for loading the second string.
1738 RESULT_REG is the rtx for the result register.
1739 EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
1740 to strcmp/strncmp if we have equality at the end of the inline comparison.
9d36bd3b
AS
1741 P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
1742 to clean up and generate the final comparison result.
ef4adf1f 1743 FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
74f9986e
AS
1744 set the final result. */
1745static void
9d36bd3b
AS
1746expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
1747 unsigned int base_align,
1748 rtx orig_src1, rtx orig_src2,
1749 rtx tmp_reg_src1, rtx tmp_reg_src2, rtx result_reg,
1750 bool equality_compare_rest, rtx *p_cleanup_label,
1751 rtx final_move_label)
74f9986e
AS
1752{
1753 unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
1754 machine_mode load_mode;
1755 unsigned int load_mode_size;
1756 unsigned HOST_WIDE_INT cmp_bytes = 0;
1757 unsigned HOST_WIDE_INT offset = 0;
1758 rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
1759 rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
9d36bd3b
AS
1760 gcc_assert (p_cleanup_label != NULL);
1761 rtx cleanup_label = *p_cleanup_label;
74f9986e
AS
1762
1763 while (bytes_to_compare > 0)
1764 {
1765 /* GPR compare sequence:
ef4adf1f
AS
1766 check each 8B with: ld/ld/cmpb/cmpb/orc./bne
1767
74f9986e 1768 cleanup code at end:
74f9986e
AS
1769 cntlzd get bit of first zero/diff byte
1770 subfic convert for rldcl use
1771 rldcl rldcl extract diff/zero byte
1772 subf subtract for final result
1773
1774 The last compare can branch around the cleanup code if the
1775 result is zero because the strings are exactly equal. */
ef4adf1f 1776
74f9986e
AS
1777 unsigned int align = compute_current_alignment (base_align, offset);
1778 load_mode = select_block_compare_mode (offset, bytes_to_compare, align);
1779 load_mode_size = GET_MODE_SIZE (load_mode);
1780 if (bytes_to_compare >= load_mode_size)
1781 cmp_bytes = load_mode_size;
1782 else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
1783 {
1784 /* Move this load back so it doesn't go past the end.
1785 P8/P9 can do this efficiently. */
1786 unsigned int extra_bytes = load_mode_size - bytes_to_compare;
1787 cmp_bytes = bytes_to_compare;
1788 if (extra_bytes < offset)
1789 {
1790 offset -= extra_bytes;
1791 cmp_bytes = load_mode_size;
1792 bytes_to_compare = cmp_bytes;
1793 }
1794 }
1795 else
1796 /* P7 and earlier can't do the overlapping load trick fast,
1797 so this forces a non-overlapping load and a shift to get
1798 rid of the extra bytes. */
1799 cmp_bytes = bytes_to_compare;
1800
ef4adf1f
AS
1801 rtx offset_reg = gen_reg_rtx (Pmode);
1802 emit_move_insn (offset_reg, GEN_INT (offset));
1803
1804 rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_reg);
74f9986e 1805 do_load_for_compare_from_addr (load_mode, tmp_reg_src1, addr1, orig_src1);
ef4adf1f 1806 rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_reg);
74f9986e
AS
1807 do_load_for_compare_from_addr (load_mode, tmp_reg_src2, addr2, orig_src2);
1808
1809 /* We must always left-align the data we read, and
1810 clear any bytes to the right that are beyond the string.
1811 Otherwise the cmpb sequence won't produce the correct
ef4adf1f
AS
1812 results. However if there is only one byte left, we
1813 can just subtract to get the final result so the shifts
1814 and clears are not needed. */
74f9986e 1815
ef4adf1f 1816 unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;
74f9986e 1817
ef4adf1f
AS
1818 /* Loading just a single byte is a special case. If we are
1819 loading more than that, we have to check whether we are
1820 looking at the entire chunk of data. If not, rotate left and
1821 clear right so that bytes we aren't supposed to look at are
1822 zeroed, and the first byte we are supposed to compare is
1823 leftmost. */
1824 if (load_mode_size != 1)
74f9986e 1825 {
ef4adf1f
AS
1826 if (load_mode_size < word_mode_size)
1827 {
1828 /* Rotate left first. */
1829 rtx sh = GEN_INT (BITS_PER_UNIT
1830 * (word_mode_size - load_mode_size));
1831 do_rotl3 (tmp_reg_src1, tmp_reg_src1, sh);
1832 do_rotl3 (tmp_reg_src2, tmp_reg_src2, sh);
1833 }
1834
1835 if (cmp_bytes < word_mode_size)
1836 {
1837 /* Now clear right. This plus the rotate can be
1838 turned into a rldicr instruction. */
1839 HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
1840 rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
1841 do_and3 (tmp_reg_src1, tmp_reg_src1, mask);
1842 do_and3 (tmp_reg_src2, tmp_reg_src2, mask);
1843 }
74f9986e
AS
1844 }
1845
1846 /* Cases to handle. A and B are chunks of the two strings.
1847 1: Not end of comparison:
1848 A != B: branch to cleanup code to compute result.
1849 A == B: check for 0 byte, next block if not found.
1850 2: End of the inline comparison:
1851 A != B: branch to cleanup code to compute result.
1852 A == B: check for 0 byte, call strcmp/strncmp
1853 3: compared requested N bytes:
1854 A == B: branch to result 0.
1855 A != B: cleanup code to compute result. */
1856
74f9986e
AS
1857 rtx dst_label;
1858 if (remain > 0 || equality_compare_rest)
1859 {
1860 /* Branch to cleanup code, otherwise fall through to do
1861 more compares. */
1862 if (!cleanup_label)
1863 cleanup_label = gen_label_rtx ();
1864 dst_label = cleanup_label;
1865 }
1866 else
1867 /* Branch to end and produce result of 0. */
1868 dst_label = final_move_label;
1869
ef4adf1f
AS
1870 if (load_mode_size == 1)
1871 {
1872 /* Special case for comparing just single byte. */
1873 if (equality_compare_rest)
1874 {
1875 /* Use subf./bne to branch to final_move_label if the
1876 byte differs, otherwise fall through to the strncmp
1877 call. We must also check for a zero byte here as we
1878 must not make the library call if this is the end of
1879 the string. */
1880
1881 rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
1882 rtx cond = gen_reg_rtx (CCmode);
1883 rtx diff_rtx = gen_rtx_MINUS (word_mode,
1884 tmp_reg_src1, tmp_reg_src2);
1885 rs6000_emit_dot_insn (result_reg, diff_rtx, 2, cond);
1886 rtx cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
1887
1888 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
1889 lab_ref, pc_rtx);
1890 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
1891 JUMP_LABEL (j) = final_move_label;
1892 LABEL_NUSES (final_move_label) += 1;
74f9986e 1893
ef4adf1f
AS
1894 /* Check for zero byte here before fall through to
1895 library call. This catches the case where the
1896 strings are equal and end in a zero byte at this
1897 position. */
74f9986e 1898
ef4adf1f
AS
1899 rtx cond0 = gen_reg_rtx (CCmode);
1900 emit_move_insn (cond0, gen_rtx_COMPARE (CCmode, tmp_reg_src1,
1901 const0_rtx));
74f9986e 1902
ef4adf1f 1903 rtx cmp0eq_rtx = gen_rtx_EQ (VOIDmode, cond0, const0_rtx);
74f9986e 1904
ef4adf1f
AS
1905 rtx ifelse0 = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp0eq_rtx,
1906 lab_ref, pc_rtx);
1907 rtx j0 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse0));
1908 JUMP_LABEL (j0) = final_move_label;
1909 LABEL_NUSES (final_move_label) += 1;
1910 }
1911 else
1912 {
1913 /* This is the last byte to be compared so we can use
1914 subf to compute the final result and branch
1915 unconditionally to final_move_label. */
1916
1917 do_sub3 (result_reg, tmp_reg_src1, tmp_reg_src2);
1918
1919 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
1920 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
1921 JUMP_LABEL (j) = final_move_label;
1922 LABEL_NUSES (final_move_label) += 1;
1923 emit_barrier ();
1924 }
1925 }
1926 else
74f9986e 1927 {
74f9986e 1928 rtx cmpb_zero = gen_reg_rtx (word_mode);
ef4adf1f 1929 rtx cmpb_diff = gen_reg_rtx (word_mode);
74f9986e 1930 rtx zero_reg = gen_reg_rtx (word_mode);
ef4adf1f
AS
1931 rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
1932 rtx cond = gen_reg_rtx (CCmode);
1933
74f9986e 1934 emit_move_insn (zero_reg, GEN_INT (0));
ef4adf1f 1935 do_cmpb3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2);
74f9986e 1936 do_cmpb3 (cmpb_zero, tmp_reg_src1, zero_reg);
ef4adf1f
AS
1937 rtx not_diff = gen_rtx_NOT (word_mode, cmpb_diff);
1938 rtx orc_rtx = gen_rtx_IOR (word_mode, not_diff, cmpb_zero);
74f9986e 1939
ef4adf1f 1940 rs6000_emit_dot_insn (result_reg, orc_rtx, 2, cond);
74f9986e 1941
ef4adf1f
AS
1942 rtx cmp_rtx;
1943 if (remain == 0 && !equality_compare_rest)
1944 cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
1945 else
1946 cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
74f9986e 1947
ef4adf1f
AS
1948 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
1949 lab_ref, pc_rtx);
1950 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
1951 JUMP_LABEL (j) = dst_label;
1952 LABEL_NUSES (dst_label) += 1;
74f9986e
AS
1953 }
1954
1955 offset += cmp_bytes;
1956 bytes_to_compare -= cmp_bytes;
1957 }
1958
9d36bd3b
AS
1959 *p_cleanup_label = cleanup_label;
1960 return;
1961}
1962
ef4adf1f 1963/* Generate the sequence of compares for strcmp/strncmp using vec/vsx
9d36bd3b
AS
1964 instructions.
1965
1966 BYTES_TO_COMPARE is the number of bytes to be compared.
1967 ORIG_SRC1 is the unmodified rtx for the first string.
1968 ORIG_SRC2 is the unmodified rtx for the second string.
1969 S1ADDR is the register to use for the base address of the first string.
1970 S2ADDR is the register to use for the base address of the second string.
1971 OFF_REG is the register to use for the string offset for loads.
1972 S1DATA is the register for loading the first string.
1973 S2DATA is the register for loading the second string.
1974 VEC_RESULT is the rtx for the vector result indicating the byte difference.
1975 EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
1976 to strcmp/strncmp if we have equality at the end of the inline comparison.
1977 P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code to clean up
1978 and generate the final comparison result.
ef4adf1f 1979 FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
9d36bd3b
AS
1980 set the final result. */
1981static void
1982expand_strncmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
1983 rtx orig_src1, rtx orig_src2,
1984 rtx s1addr, rtx s2addr, rtx off_reg,
1985 rtx s1data, rtx s2data,
1986 rtx vec_result, bool equality_compare_rest,
1987 rtx *p_cleanup_label, rtx final_move_label)
1988{
1989 machine_mode load_mode;
1990 unsigned int load_mode_size;
1991 unsigned HOST_WIDE_INT cmp_bytes = 0;
1992 unsigned HOST_WIDE_INT offset = 0;
1993
1994 gcc_assert (p_cleanup_label != NULL);
1995 rtx cleanup_label = *p_cleanup_label;
1996
1997 emit_move_insn (s1addr, force_reg (Pmode, XEXP (orig_src1, 0)));
1998 emit_move_insn (s2addr, force_reg (Pmode, XEXP (orig_src2, 0)));
1999
2000 unsigned int i;
2001 rtx zr[16];
2002 for (i = 0; i < 16; i++)
2003 zr[i] = GEN_INT (0);
2004 rtvec zv = gen_rtvec_v (16, zr);
2005 rtx zero_reg = gen_reg_rtx (V16QImode);
2006 rs6000_expand_vector_init (zero_reg, gen_rtx_PARALLEL (V16QImode, zv));
2007
2008 while (bytes_to_compare > 0)
2009 {
2010 /* VEC/VSX compare sequence for P8:
2011 check each 16B with:
2012 lxvd2x 32,28,8
2013 lxvd2x 33,29,8
2014 vcmpequb 2,0,1 # compare strings
2015 vcmpequb 4,0,3 # compare w/ 0
2016 xxlorc 37,36,34 # first FF byte is either mismatch or end of string
2017 vcmpequb. 7,5,3 # reg 7 contains 0
2018 bnl 6,.Lmismatch
2019
2020 For the P8 LE case, we use lxvd2x and compare full 16 bytes
2021 but then use use vgbbd and a shift to get two bytes with the
2022 information we need in the correct order.
2023
2024 VEC/VSX compare sequence if TARGET_P9_VECTOR:
2025 lxvb16x/lxvb16x # load 16B of each string
2026 vcmpnezb. # produces difference location or zero byte location
2027 bne 6,.Lmismatch
2028
2029 Use the overlapping compare trick for the last block if it is
ef4adf1f 2030 less than 16 bytes.
9d36bd3b
AS
2031 */
2032
2033 load_mode = V16QImode;
2034 load_mode_size = GET_MODE_SIZE (load_mode);
ef4adf1f 2035
9d36bd3b
AS
2036 if (bytes_to_compare >= load_mode_size)
2037 cmp_bytes = load_mode_size;
2038 else
2039 {
2040 /* Move this load back so it doesn't go past the end. P8/P9
2041 can do this efficiently. This is never called with less
2042 than 16 bytes so we should always be able to do this. */
2043 unsigned int extra_bytes = load_mode_size - bytes_to_compare;
2044 cmp_bytes = bytes_to_compare;
2045 gcc_assert (offset > extra_bytes);
2046 offset -= extra_bytes;
2047 cmp_bytes = load_mode_size;
2048 bytes_to_compare = cmp_bytes;
2049 }
2050
2051 /* The offset currently used is always kept in off_reg so that the
2052 cleanup code on P8 can use it to extract the differing byte. */
2053 emit_move_insn (off_reg, GEN_INT (offset));
2054
2055 rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg);
2056 do_load_for_compare_from_addr (load_mode, s1data, addr1, orig_src1);
2057 rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg);
2058 do_load_for_compare_from_addr (load_mode, s2data, addr2, orig_src2);
2059
2060 /* Cases to handle. A and B are chunks of the two strings.
2061 1: Not end of comparison:
2062 A != B: branch to cleanup code to compute result.
2063 A == B: next block
2064 2: End of the inline comparison:
2065 A != B: branch to cleanup code to compute result.
2066 A == B: call strcmp/strncmp
2067 3: compared requested N bytes:
2068 A == B: branch to result 0.
2069 A != B: cleanup code to compute result. */
2070
2071 unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;
2072
2073 if (TARGET_P9_VECTOR)
2074 emit_insn (gen_vcmpnezb_p (vec_result, s1data, s2data));
2075 else
2076 {
2077 /* Emit instructions to do comparison and zero check. */
2078 rtx cmp_res = gen_reg_rtx (load_mode);
2079 rtx cmp_zero = gen_reg_rtx (load_mode);
2080 rtx cmp_combined = gen_reg_rtx (load_mode);
2081 emit_insn (gen_altivec_eqv16qi (cmp_res, s1data, s2data));
2082 emit_insn (gen_altivec_eqv16qi (cmp_zero, s1data, zero_reg));
2083 emit_insn (gen_orcv16qi3 (vec_result, cmp_zero, cmp_res));
2084 emit_insn (gen_altivec_vcmpequb_p (cmp_combined, vec_result, zero_reg));
2085 }
2086
2087 bool branch_to_cleanup = (remain > 0 || equality_compare_rest);
2088 rtx cr6 = gen_rtx_REG (CCmode, CR6_REGNO);
2089 rtx dst_label;
2090 rtx cmp_rtx;
2091 if (branch_to_cleanup)
2092 {
2093 /* Branch to cleanup code, otherwise fall through to do more
ef4adf1f 2094 compares. P8 and P9 use different CR bits because on P8
9d36bd3b
AS
2095 we are looking at the result of a comparsion vs a
2096 register of zeroes so the all-true condition means no
ef4adf1f 2097 difference or zero was found. On P9, vcmpnezb sets a byte
9d36bd3b
AS
2098 to 0xff if there is a mismatch or zero, so the all-false
2099 condition indicates we found no difference or zero. */
2100 if (!cleanup_label)
2101 cleanup_label = gen_label_rtx ();
2102 dst_label = cleanup_label;
2103 if (TARGET_P9_VECTOR)
2104 cmp_rtx = gen_rtx_NE (VOIDmode, cr6, const0_rtx);
2105 else
2106 cmp_rtx = gen_rtx_GE (VOIDmode, cr6, const0_rtx);
2107 }
2108 else
2109 {
ef4adf1f 2110 /* Branch to final return or fall through to cleanup,
9d36bd3b
AS
2111 result is already set to 0. */
2112 dst_label = final_move_label;
2113 if (TARGET_P9_VECTOR)
2114 cmp_rtx = gen_rtx_EQ (VOIDmode, cr6, const0_rtx);
2115 else
2116 cmp_rtx = gen_rtx_LT (VOIDmode, cr6, const0_rtx);
2117 }
2118
2119 rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
2120 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
2121 lab_ref, pc_rtx);
2122 rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
2123 JUMP_LABEL (j2) = dst_label;
2124 LABEL_NUSES (dst_label) += 1;
2125
2126 offset += cmp_bytes;
2127 bytes_to_compare -= cmp_bytes;
2128 }
2129 *p_cleanup_label = cleanup_label;
2130 return;
74f9986e
AS
2131}
2132
f7e94dfb
AS
2133/* Generate the final sequence that identifies the differing
2134 byte and generates the final result, taking into account
2135 zero bytes:
ef4adf1f 2136
f7e94dfb
AS
2137 cntlzd get bit of first zero/diff byte
2138 addi convert for rldcl use
2139 rldcl rldcl extract diff/zero byte
2140 subf subtract for final result
2141
2142 STR1 is the reg rtx for data from string 1.
2143 STR2 is the reg rtx for data from string 2.
2144 RESULT is the reg rtx for the comparison result. */
2145
2146static void
2147emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result)
2148{
2149 machine_mode m = GET_MODE (str1);
f7e94dfb 2150 rtx rot_amt = gen_reg_rtx (m);
f7e94dfb
AS
2151
2152 rtx rot1_1 = gen_reg_rtx (m);
2153 rtx rot1_2 = gen_reg_rtx (m);
2154 rtx rot2_1 = gen_reg_rtx (m);
2155 rtx rot2_2 = gen_reg_rtx (m);
2156
2157 if (m == SImode)
2158 {
ef4adf1f 2159 emit_insn (gen_clzsi2 (rot_amt, result));
f7e94dfb
AS
2160 emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
2161 emit_insn (gen_rotlsi3 (rot1_1, str1,
2162 gen_lowpart (SImode, rot_amt)));
2163 emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
2164 emit_insn (gen_rotlsi3 (rot2_1, str2,
2165 gen_lowpart (SImode, rot_amt)));
2166 emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
2167 emit_insn (gen_subsi3 (result, rot1_2, rot2_2));
2168 }
2169 else if (m == DImode)
2170 {
ef4adf1f 2171 emit_insn (gen_clzdi2 (rot_amt, result));
f7e94dfb
AS
2172 emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
2173 emit_insn (gen_rotldi3 (rot1_1, str1,
2174 gen_lowpart (SImode, rot_amt)));
2175 emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
2176 emit_insn (gen_rotldi3 (rot2_1, str2,
2177 gen_lowpart (SImode, rot_amt)));
2178 emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
2179 emit_insn (gen_subdi3 (result, rot1_2, rot2_2));
2180 }
2181 else
2182 gcc_unreachable ();
ef4adf1f 2183
f7e94dfb
AS
2184 return;
2185}
2186
9d36bd3b
AS
/* Generate the final sequence that identifies the differing
   byte and generates the final result, taking into account
   zero bytes:

   P8:
   vgbbd 0,0
   vsldoi 0,0,0,9
   mfvsrd 9,32
   addi 10,9,-1    # count trailing zero bits
   andc 9,10,9
   popcntd 9,9
   lbzx 10,28,9    # use that offset to load differing byte
   lbzx 3,29,9
   subf 3,3,10     # subtract for final result

   P9:
   vclzlsbb        # counts trailing bytes with lsb=0
   vextublx        # extract differing byte

   STR1 is the reg rtx for data from string 1.
   STR2 is the reg rtx for data from string 2.
   RESULT is the reg rtx for the comparison result.
   S1ADDR is the register to use for the base address of the first string.
   S2ADDR is the register to use for the base address of the second string.
   ORIG_SRC1 is the unmodified rtx for the first string.
   ORIG_SRC2 is the unmodified rtx for the second string.
   OFF_REG is the register to use for the string offset for loads.
   VEC_RESULT is the rtx for the vector result indicating the byte difference.
   */

static void
emit_final_str_compare_vec (rtx str1, rtx str2, rtx result,
			    rtx s1addr, rtx s2addr,
			    rtx orig_src1, rtx orig_src2,
			    rtx off_reg, rtx vec_result)
{
  if (TARGET_P9_VECTOR)
    {
      /* P9: vclzlsbb gives the byte index of the first mismatch/zero
	 directly, and vextublx extracts that byte from each string's
	 data so the difference can be computed without reloading.  */
      rtx diffix = gen_reg_rtx (SImode);
      rtx chr1 = gen_reg_rtx (SImode);
      rtx chr2 = gen_reg_rtx (SImode);
      /* The extracted bytes are viewed as DImode for the subtract,
	 which operates on word_mode values.  */
      rtx chr1_di = simplify_gen_subreg (DImode, chr1, SImode, 0);
      rtx chr2_di = simplify_gen_subreg (DImode, chr2, SImode, 0);
      emit_insn (gen_vclzlsbb_v16qi (diffix, vec_result));
      emit_insn (gen_vextublx (chr1, diffix, str1));
      emit_insn (gen_vextublx (chr2, diffix, str2));
      do_sub3 (result, chr1_di, chr2_di);
    }
  else
    {
      /* This path requires P8 vector instructions (vgbbd, mfvsrd).  */
      gcc_assert (TARGET_P8_VECTOR);
      rtx diffix = gen_reg_rtx (DImode);
      rtx result_gbbd = gen_reg_rtx (V16QImode);
      /* Since each byte of the input is either 00 or FF, the bytes in
	 dw0 and dw1 after vgbbd are all identical to each other.  */
      emit_insn (gen_p8v_vgbbd (result_gbbd, vec_result));
      /* For LE, we shift by 9 and get BA in the low two bytes then CTZ.
	 For BE, we shift by 7 and get AB in the high two bytes then CLZ.  */
      rtx result_shifted = gen_reg_rtx (V16QImode);
      int shift_amt = (BYTES_BIG_ENDIAN) ? 7 : 9;
      emit_insn (gen_altivec_vsldoi_v16qi (result_shifted, result_gbbd,
					   result_gbbd, GEN_INT (shift_amt)));

      /* Move the two interesting bytes to a GPR via mfvsrd; the DFmode
	 subreg is just how the mfvsrd pattern is expressed.  */
      rtx diffix_df = simplify_gen_subreg (DFmode, diffix, DImode, 0);
      emit_insn (gen_p8_mfvsrd_3_v16qi (diffix_df, result_shifted));
      rtx count = gen_reg_rtx (DImode);

      /* Count zero bits up to the first set bit, which marks the
	 first mismatching or terminating byte position.  */
      if (BYTES_BIG_ENDIAN)
	emit_insn (gen_clzdi2 (count, diffix));
      else
	emit_insn (gen_ctzdi2 (count, diffix));

      /* P8 doesn't have a good solution for extracting one byte from
	 a vsx reg like vextublx on P9 so we just compute the offset
	 of the differing byte and load it from each string.  */
      do_add3 (off_reg, off_reg, count);

      rtx chr1 = gen_reg_rtx (QImode);
      rtx chr2 = gen_reg_rtx (QImode);
      rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg);
      do_load_for_compare_from_addr (QImode, chr1, addr1, orig_src1);
      rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg);
      do_load_for_compare_from_addr (QImode, chr2, addr2, orig_src2);
      /* View the loaded bytes in the result's mode for the subtract.  */
      machine_mode rmode = GET_MODE (result);
      rtx chr1_rm = simplify_gen_subreg (rmode, chr1, QImode, 0);
      rtx chr2_rm = simplify_gen_subreg (rmode, chr2, QImode, 0);
      do_sub3 (result, chr1_rm, chr2_rm);
    }

  return;
}
2277
/* Expand a string compare operation with length, and return
   true if successful.  Return false if we should let the
   compiler generate normal code, probably a strncmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   If NO_LENGTH is zero, then:
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment in bytes.
   If NO_LENGTH is nonzero, then:
   OPERANDS[3] is the alignment in bytes.  */
bool
expand_strn_compare (rtx operands[], int no_length)
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx, align_rtx;
  if (no_length)
    {
      /* strcmp case: no length operand, operand 3 is the alignment.  */
      bytes_rtx = NULL;
      align_rtx = operands[3];
    }
  else
    {
      bytes_rtx = operands[3];
      align_rtx = operands[4];
    }

  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));

  /* If we have a length, it must be constant.  This simplifies things
     a bit as we don't have to generate code to check if we've exceeded
     the length.  Later this could be expanded to handle this case.  */
  if (!no_length && !CONST_INT_P (bytes_rtx))
    return false;

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx);
  unsigned int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  unsigned int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.  */
  if (targetm.slow_unaligned_access (word_mode, align1)
      || targetm.slow_unaligned_access (word_mode, align2))
    return false;

  gcc_assert (GET_MODE (target) == SImode);

  /* Minimum alignment (in bytes) needed to avoid a runtime 4k-boundary
     check; raised to 16 below when the vector sequence is used.  */
  unsigned int required_align = 8;

  unsigned HOST_WIDE_INT offset = 0;
  unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available.  */
  unsigned HOST_WIDE_INT compare_length; /* How much to compare inline.  */

  if (no_length)
    bytes = rs6000_string_compare_inline_limit;
  else
    bytes = UINTVAL (bytes_rtx);

  /* Is it OK to use vec/vsx for this.  TARGET_VSX means we have at
     least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
     at least POWER8.  That way we can rely on overlapping compares to
     do the final comparison of less than 16 bytes.  Also I do not
     want to deal with making this work for 32 bits.  In addition, we
     have to make sure that we have at least P8_VECTOR (we don't allow
     P9_VECTOR without P8_VECTOR).  */
  int use_vec = (bytes >= 16 && !TARGET_32BIT
		 && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR);

  if (use_vec)
    required_align = 16;

  machine_mode load_mode;
  rtx tmp_reg_src1, tmp_reg_src2;
  if (use_vec)
    {
      load_mode = V16QImode;
      tmp_reg_src1 = gen_reg_rtx (V16QImode);
      tmp_reg_src2 = gen_reg_rtx (V16QImode);
    }
  else
    {
      load_mode = select_block_compare_mode (0, bytes, base_align);
      tmp_reg_src1 = gen_reg_rtx (word_mode);
      tmp_reg_src2 = gen_reg_rtx (word_mode);
    }

  compare_length = rs6000_string_compare_inline_limit;

  /* If we have equality at the end of the last compare and we have not
     found the end of the string, we need to call strcmp/strncmp to
     compare the remainder.  */
  bool equality_compare_rest = false;

  if (no_length)
    {
      bytes = compare_length;
      equality_compare_rest = true;
    }
  else
    {
      if (bytes <= compare_length)
	compare_length = bytes;
      else
	equality_compare_rest = true;
    }

  rtx result_reg = gen_reg_rtx (word_mode);
  rtx final_move_label = gen_label_rtx ();
  rtx final_label = gen_label_rtx ();
  rtx begin_compare_label = NULL;

  if (base_align < required_align)
    {
      /* Generate code that checks distance to 4k boundary for this case.  */
      begin_compare_label = gen_label_rtx ();
      rtx strncmp_label = gen_label_rtx ();
      rtx jmp;

      /* Strncmp for power8 in glibc does this:
	 rldicl r8,r3,0,52
	 cmpldi cr7,r8,4096-16
	 bgt cr7,L(pagecross) */

      /* Make sure that the length we use for the alignment test and
	 the subsequent code generation are in agreement so we do not
	 go past the length we tested for a 4k boundary crossing.  */
      unsigned HOST_WIDE_INT align_test = compare_length;
      if (align_test < required_align)
	{
	  /* Round up to a power of two so the boundary test covers
	     exactly the bytes the inline code will touch.  */
	  align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
	  base_align = align_test;
	}
      else
	{
	  align_test = ROUND_UP (align_test, required_align);
	  base_align = required_align;
	}

      /* Only emit the page-cross check for a string whose known
	 alignment is insufficient.  */
      if (align1 < required_align)
	expand_strncmp_align_check (strncmp_label, src1_addr, align_test);
      if (align2 < required_align)
	expand_strncmp_align_check (strncmp_label, src2_addr, align_test);

      /* Now generate the following sequence:
	 - branch to begin_compare
	 - strncmp_label
	 - call to strncmp
	 - branch to final_label
	 - begin_compare_label */

      rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
      JUMP_LABEL (jmp) = begin_compare_label;
      LABEL_NUSES (begin_compare_label) += 1;
      emit_barrier ();

      emit_label (strncmp_label);

      /* Fallback library call used when a 4k boundary may be crossed.  */
      if (no_length)
	{
	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, src1_addr), Pmode,
				   force_reg (Pmode, src2_addr), Pmode);
	}
      else
	{
	  /* -m32 -mpowerpc64 results in word_mode being DImode even
	     though otherwise it is 32-bit.  The length arg to strncmp
	     is a size_t which will be the same size as pointers.  */
	  rtx len_rtx = gen_reg_rtx (Pmode);
	  emit_move_insn (len_rtx, gen_int_mode (bytes, Pmode));

	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, src1_addr), Pmode,
				   force_reg (Pmode, src2_addr), Pmode,
				   len_rtx, Pmode);
	}

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
      emit_label (begin_compare_label);
    }

  rtx cleanup_label = NULL;
  rtx s1addr = NULL, s2addr = NULL, off_reg = NULL, vec_result = NULL;

  /* Generate a sequence of GPR or VEC/VSX instructions to compare out
     to the length specified.  */
  if (use_vec)
    {
      s1addr = gen_reg_rtx (Pmode);
      s2addr = gen_reg_rtx (Pmode);
      off_reg = gen_reg_rtx (Pmode);
      vec_result = gen_reg_rtx (load_mode);
      /* The vector path's "all equal" exit relies on result_reg
	 already holding zero.  */
      emit_move_insn (result_reg, GEN_INT (0));
      expand_strncmp_vec_sequence (compare_length,
				   orig_src1, orig_src2,
				   s1addr, s2addr, off_reg,
				   tmp_reg_src1, tmp_reg_src2,
				   vec_result,
				   equality_compare_rest,
				   &cleanup_label, final_move_label);
    }
  else
    expand_strncmp_gpr_sequence (compare_length, base_align,
				 orig_src1, orig_src2,
				 tmp_reg_src1, tmp_reg_src2,
				 result_reg,
				 equality_compare_rest,
				 &cleanup_label, final_move_label);

  offset = compare_length;

  if (equality_compare_rest)
    {
      /* Update pointers past what has been compared already.  */
      rtx src1 = force_reg (Pmode,
			    gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset)));
      rtx src2 = force_reg (Pmode,
			    gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset)));

      /* Construct call to strcmp/strncmp to compare the rest of the
	 string.  */
      if (no_length)
	{
	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   src1, Pmode, src2, Pmode);
	}
      else
	{
	  rtx len_rtx = gen_reg_rtx (Pmode);
	  emit_move_insn (len_rtx,
			  gen_int_mode (bytes - compare_length, Pmode));
	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   src1, Pmode, src2, Pmode, len_rtx, Pmode);
	}

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }

  if (cleanup_label)
    emit_label (cleanup_label);

  /* Emit the code that turns the saved data chunks into the final
     signed comparison result.  */
  if (use_vec)
    emit_final_str_compare_vec (tmp_reg_src1, tmp_reg_src2, result_reg,
				s1addr, s2addr, orig_src1, orig_src2,
				off_reg, vec_result);
  else
    emit_final_str_compare_gpr (tmp_reg_src1, tmp_reg_src2, result_reg);

  emit_label (final_move_label);
  /* The result is always delivered as SImode, per the gcc_assert above.  */
  emit_insn (gen_movsi (target,
			gen_lowpart (SImode, result_reg)));
  emit_label (final_label);
  return true;
}
2554
2555/* Expand a block move operation, and return 1 if successful. Return 0
2556 if we should let the compiler generate normal code.
2557
2558 operands[0] is the destination
2559 operands[1] is the source
2560 operands[2] is the length
2561 operands[3] is the alignment */
2562
2563#define MAX_MOVE_REG 4
2564
2565int
2566expand_block_move (rtx operands[])
2567{
2568 rtx orig_dest = operands[0];
2569 rtx orig_src = operands[1];
2570 rtx bytes_rtx = operands[2];
2571 rtx align_rtx = operands[3];
2572 int constp = (GET_CODE (bytes_rtx) == CONST_INT);
2573 int align;
2574 int bytes;
2575 int offset;
2576 int move_bytes;
2577 rtx stores[MAX_MOVE_REG];
2578 int num_reg = 0;
2579
2580 /* If this is not a fixed size move, just call memcpy */
2581 if (! constp)
2582 return 0;
2583
2584 /* This must be a fixed size alignment */
2585 gcc_assert (GET_CODE (align_rtx) == CONST_INT);
2586 align = INTVAL (align_rtx) * BITS_PER_UNIT;
2587
2588 /* Anything to move? */
2589 bytes = INTVAL (bytes_rtx);
2590 if (bytes <= 0)
2591 return 1;
2592
2593 if (bytes > rs6000_block_move_inline_limit)
2594 return 0;
2595
2596 for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
2597 {
2598 union {
2599 rtx (*movmemsi) (rtx, rtx, rtx, rtx);
2600 rtx (*mov) (rtx, rtx);
2601 } gen_func;
2602 machine_mode mode = BLKmode;
2603 rtx src, dest;
2604
2605 /* Altivec first, since it will be faster than a string move
2606 when it applies, and usually not significantly larger. */
3b0cb1a5 2607 if (TARGET_ALTIVEC && bytes >= 16 && (TARGET_EFFICIENT_UNALIGNED_VSX || align >= 128))
8845cb37
AS
2608 {
2609 move_bytes = 16;
2610 mode = V4SImode;
2611 gen_func.mov = gen_movv4si;
2612 }
8845cb37
AS
2613 else if (bytes >= 8 && TARGET_POWERPC64
2614 && (align >= 64 || !STRICT_ALIGNMENT))
2615 {
2616 move_bytes = 8;
2617 mode = DImode;
2618 gen_func.mov = gen_movdi;
2619 if (offset == 0 && align < 64)
2620 {
2621 rtx addr;
2622
2623 /* If the address form is reg+offset with offset not a
2624 multiple of four, reload into reg indirect form here
2625 rather than waiting for reload. This way we get one
2626 reload, not one per load and/or store. */
2627 addr = XEXP (orig_dest, 0);
2628 if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
2629 && GET_CODE (XEXP (addr, 1)) == CONST_INT
2630 && (INTVAL (XEXP (addr, 1)) & 3) != 0)
2631 {
2632 addr = copy_addr_to_reg (addr);
2633 orig_dest = replace_equiv_address (orig_dest, addr);
2634 }
2635 addr = XEXP (orig_src, 0);
2636 if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
2637 && GET_CODE (XEXP (addr, 1)) == CONST_INT
2638 && (INTVAL (XEXP (addr, 1)) & 3) != 0)
2639 {
2640 addr = copy_addr_to_reg (addr);
2641 orig_src = replace_equiv_address (orig_src, addr);
2642 }
2643 }
2644 }
8845cb37
AS
2645 else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
2646 { /* move 4 bytes */
2647 move_bytes = 4;
2648 mode = SImode;
2649 gen_func.mov = gen_movsi;
2650 }
2651 else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
2652 { /* move 2 bytes */
2653 move_bytes = 2;
2654 mode = HImode;
2655 gen_func.mov = gen_movhi;
2656 }
8845cb37
AS
2657 else /* move 1 byte at a time */
2658 {
2659 move_bytes = 1;
2660 mode = QImode;
2661 gen_func.mov = gen_movqi;
2662 }
2663
2664 src = adjust_address (orig_src, mode, offset);
2665 dest = adjust_address (orig_dest, mode, offset);
2666
2667 if (mode != BLKmode)
2668 {
2669 rtx tmp_reg = gen_reg_rtx (mode);
2670
2671 emit_insn ((*gen_func.mov) (tmp_reg, src));
2672 stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
2673 }
2674
2675 if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
2676 {
2677 int i;
2678 for (i = 0; i < num_reg; i++)
2679 emit_insn (stores[i]);
2680 num_reg = 0;
2681 }
2682
2683 if (mode == BLKmode)
2684 {
2685 /* Move the address into scratch registers. The movmemsi
2686 patterns require zero offset. */
2687 if (!REG_P (XEXP (src, 0)))
2688 {
2689 rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
2690 src = replace_equiv_address (src, src_reg);
2691 }
2692 set_mem_size (src, move_bytes);
2693
2694 if (!REG_P (XEXP (dest, 0)))
2695 {
2696 rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
2697 dest = replace_equiv_address (dest, dest_reg);
2698 }
2699 set_mem_size (dest, move_bytes);
2700
2701 emit_insn ((*gen_func.movmemsi) (dest, src,
2702 GEN_INT (move_bytes & 31),
2703 align_rtx));
2704 }
2705 }
2706
2707 return 1;
2708}