]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/config/rs6000/rs6000-string.c
re PR fortran/85111 (ICE in min_max_choose, at fortran/simplify.c:4884 (and others))
[thirdparty/gcc.git] / gcc / config / rs6000 / rs6000-string.c
CommitLineData
8845cb37
AS
1/* Subroutines used to expand string and block move, clear,
2 compare and other operations for PowerPC.
85ec4feb 3 Copyright (C) 1991-2018 Free Software Foundation, Inc.
8845cb37
AS
4
5 This file is part of GCC.
6
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published
9 by the Free Software Foundation; either version 3, or (at your
10 option) any later version.
11
12 GCC is distributed in the hope that it will be useful, but WITHOUT
13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
15 License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
20
8fcc61f8
RS
21#define IN_TARGET_CODE 1
22
8845cb37
AS
23#include "config.h"
24#include "system.h"
25#include "coretypes.h"
26#include "backend.h"
27#include "rtl.h"
28#include "tree.h"
29#include "memmodel.h"
30#include "tm_p.h"
31#include "ira.h"
32#include "print-tree.h"
33#include "varasm.h"
34#include "explow.h"
35#include "expr.h"
36#include "output.h"
e0bd6c9f 37#include "target.h"
8845cb37
AS
38
39/* Expand a block clear operation, and return 1 if successful. Return 0
40 if we should let the compiler generate normal code.
41
42 operands[0] is the destination
43 operands[1] is the length
44 operands[3] is the alignment */
45
46int
47expand_block_clear (rtx operands[])
48{
49 rtx orig_dest = operands[0];
50 rtx bytes_rtx = operands[1];
51 rtx align_rtx = operands[3];
52 bool constp = (GET_CODE (bytes_rtx) == CONST_INT);
53 HOST_WIDE_INT align;
54 HOST_WIDE_INT bytes;
55 int offset;
56 int clear_bytes;
57 int clear_step;
58
59 /* If this is not a fixed size move, just call memcpy */
60 if (! constp)
61 return 0;
62
63 /* This must be a fixed size alignment */
64 gcc_assert (GET_CODE (align_rtx) == CONST_INT);
65 align = INTVAL (align_rtx) * BITS_PER_UNIT;
66
67 /* Anything to clear? */
68 bytes = INTVAL (bytes_rtx);
69 if (bytes <= 0)
70 return 1;
71
72 /* Use the builtin memset after a point, to avoid huge code bloat.
73 When optimize_size, avoid any significant code bloat; calling
74 memset is about 4 instructions, so allow for one instruction to
75 load zero and three to do clearing. */
3b0cb1a5 76 if (TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
8845cb37
AS
77 clear_step = 16;
78 else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
79 clear_step = 8;
80 else
81 clear_step = 4;
82
83 if (optimize_size && bytes > 3 * clear_step)
84 return 0;
85 if (! optimize_size && bytes > 8 * clear_step)
86 return 0;
87
88 for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
89 {
90 machine_mode mode = BLKmode;
91 rtx dest;
92
3b0cb1a5 93 if (bytes >= 16 && TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
8845cb37
AS
94 {
95 clear_bytes = 16;
96 mode = V4SImode;
97 }
98 else if (bytes >= 8 && TARGET_POWERPC64
99 && (align >= 64 || !STRICT_ALIGNMENT))
100 {
101 clear_bytes = 8;
102 mode = DImode;
103 if (offset == 0 && align < 64)
104 {
105 rtx addr;
106
107 /* If the address form is reg+offset with offset not a
108 multiple of four, reload into reg indirect form here
109 rather than waiting for reload. This way we get one
110 reload, not one per store. */
111 addr = XEXP (orig_dest, 0);
112 if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
113 && GET_CODE (XEXP (addr, 1)) == CONST_INT
114 && (INTVAL (XEXP (addr, 1)) & 3) != 0)
115 {
116 addr = copy_addr_to_reg (addr);
117 orig_dest = replace_equiv_address (orig_dest, addr);
118 }
119 }
120 }
121 else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
122 { /* move 4 bytes */
123 clear_bytes = 4;
124 mode = SImode;
125 }
126 else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
127 { /* move 2 bytes */
128 clear_bytes = 2;
129 mode = HImode;
130 }
131 else /* move 1 byte at a time */
132 {
133 clear_bytes = 1;
134 mode = QImode;
135 }
136
137 dest = adjust_address (orig_dest, mode, offset);
138
139 emit_move_insn (dest, CONST0_RTX (mode));
140 }
141
142 return 1;
143}
144
145/* Figure out the correct instructions to generate to load data for
146 block compare. MODE is used for the read from memory, and
147 data is zero extended if REG is wider than MODE. If LE code
148 is being generated, bswap loads are used.
149
150 REG is the destination register to move the data into.
151 MEM is the memory block being read.
152 MODE is the mode of memory to use for the read. */
153static void
154do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
155{
156 switch (GET_MODE (reg))
157 {
4e10a5a7 158 case E_DImode:
8845cb37
AS
159 switch (mode)
160 {
4e10a5a7 161 case E_QImode:
8845cb37
AS
162 emit_insn (gen_zero_extendqidi2 (reg, mem));
163 break;
4e10a5a7 164 case E_HImode:
8845cb37
AS
165 {
166 rtx src = mem;
167 if (!BYTES_BIG_ENDIAN)
168 {
169 src = gen_reg_rtx (HImode);
170 emit_insn (gen_bswaphi2 (src, mem));
171 }
172 emit_insn (gen_zero_extendhidi2 (reg, src));
173 break;
174 }
4e10a5a7 175 case E_SImode:
8845cb37
AS
176 {
177 rtx src = mem;
178 if (!BYTES_BIG_ENDIAN)
179 {
180 src = gen_reg_rtx (SImode);
181 emit_insn (gen_bswapsi2 (src, mem));
182 }
183 emit_insn (gen_zero_extendsidi2 (reg, src));
184 }
185 break;
4e10a5a7 186 case E_DImode:
8845cb37
AS
187 if (!BYTES_BIG_ENDIAN)
188 emit_insn (gen_bswapdi2 (reg, mem));
189 else
190 emit_insn (gen_movdi (reg, mem));
191 break;
192 default:
193 gcc_unreachable ();
194 }
195 break;
196
4e10a5a7 197 case E_SImode:
8845cb37
AS
198 switch (mode)
199 {
4e10a5a7 200 case E_QImode:
8845cb37
AS
201 emit_insn (gen_zero_extendqisi2 (reg, mem));
202 break;
4e10a5a7 203 case E_HImode:
8845cb37
AS
204 {
205 rtx src = mem;
206 if (!BYTES_BIG_ENDIAN)
207 {
208 src = gen_reg_rtx (HImode);
209 emit_insn (gen_bswaphi2 (src, mem));
210 }
211 emit_insn (gen_zero_extendhisi2 (reg, src));
212 break;
213 }
4e10a5a7 214 case E_SImode:
8845cb37
AS
215 if (!BYTES_BIG_ENDIAN)
216 emit_insn (gen_bswapsi2 (reg, mem));
217 else
218 emit_insn (gen_movsi (reg, mem));
219 break;
4e10a5a7 220 case E_DImode:
8845cb37
AS
221 /* DImode is larger than the destination reg so is not expected. */
222 gcc_unreachable ();
223 break;
224 default:
225 gcc_unreachable ();
226 }
227 break;
228 default:
229 gcc_unreachable ();
230 break;
231 }
232}
233
234/* Select the mode to be used for reading the next chunk of bytes
235 in the compare.
236
237 OFFSET is the current read offset from the beginning of the block.
238 BYTES is the number of bytes remaining to be read.
239 ALIGN is the minimum alignment of the memory blocks being compared in bytes.
240 WORD_MODE_OK indicates using WORD_MODE is allowed, else SImode is
241 the largest allowable mode. */
242static machine_mode
243select_block_compare_mode (unsigned HOST_WIDE_INT offset,
244 unsigned HOST_WIDE_INT bytes,
245 unsigned HOST_WIDE_INT align, bool word_mode_ok)
246{
247 /* First see if we can do a whole load unit
248 as that will be more efficient than a larger load + shift. */
249
250 /* If big, use biggest chunk.
251 If exactly chunk size, use that size.
252 If remainder can be done in one piece with shifting, do that.
253 Do largest chunk possible without violating alignment rules. */
254
255 /* The most we can read without potential page crossing. */
256 unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);
257
258 if (word_mode_ok && bytes >= UNITS_PER_WORD)
259 return word_mode;
260 else if (bytes == GET_MODE_SIZE (SImode))
261 return SImode;
262 else if (bytes == GET_MODE_SIZE (HImode))
263 return HImode;
264 else if (bytes == GET_MODE_SIZE (QImode))
265 return QImode;
266 else if (bytes < GET_MODE_SIZE (SImode)
267 && offset >= GET_MODE_SIZE (SImode) - bytes)
268 /* This matches the case were we have SImode and 3 bytes
269 and offset >= 1 and permits us to move back one and overlap
270 with the previous read, thus avoiding having to shift
271 unwanted bytes off of the input. */
272 return SImode;
273 else if (word_mode_ok && bytes < UNITS_PER_WORD
274 && offset >= UNITS_PER_WORD-bytes)
275 /* Similarly, if we can use DImode it will get matched here and
276 can do an overlapping read that ends at the end of the block. */
277 return word_mode;
278 else if (word_mode_ok && maxread >= UNITS_PER_WORD)
279 /* It is safe to do all remaining in one load of largest size,
280 possibly with a shift to get rid of unwanted bytes. */
281 return word_mode;
282 else if (maxread >= GET_MODE_SIZE (SImode))
283 /* It is safe to do all remaining in one SImode load,
284 possibly with a shift to get rid of unwanted bytes. */
285 return SImode;
286 else if (bytes > GET_MODE_SIZE (SImode))
287 return SImode;
288 else if (bytes > GET_MODE_SIZE (HImode))
289 return HImode;
290
291 /* final fallback is do one byte */
292 return QImode;
293}
294
295/* Compute the alignment of pointer+OFFSET where the original alignment
296 of pointer was BASE_ALIGN. */
297static unsigned HOST_WIDE_INT
298compute_current_alignment (unsigned HOST_WIDE_INT base_align,
299 unsigned HOST_WIDE_INT offset)
300{
301 if (offset == 0)
302 return base_align;
303 return MIN (base_align, offset & -offset);
304}
305
5ec3397e
AS
306/* Prepare address and then do a load.
307
308 MODE is the mode to use for the load.
309 DEST is the destination register for the data.
310 ADDR is the address to be loaded.
311 ORIG_ADDR is the original address expression. */
312static void
313do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr,
314 rtx orig_addr)
315{
316 rtx mem = gen_rtx_MEM (mode, addr);
317 MEM_COPY_ATTRIBUTES (mem, orig_addr);
318 set_mem_size (mem, GET_MODE_SIZE (mode));
319 do_load_for_compare (dest, mem, mode);
320 return;
321}
322
323/* Do a branch for an if/else decision.
324
325 CMPMODE is the mode to use for the comparison.
326 COMPARISON is the rtx code for the compare needed.
327 A is the first thing to be compared.
328 B is the second thing to be compared.
329 CR is the condition code reg input, or NULL_RTX.
330 TRUE_LABEL is the label to branch to if the condition is true.
331
332 The return value is the CR used for the comparison.
333 If CR is null_rtx, then a new register of CMPMODE is generated.
334 If A and B are both null_rtx, then CR must not be null, and the
335 compare is not generated so you can use this with a dot form insn. */
336
337static void
338do_ifelse (machine_mode cmpmode, rtx_code comparison,
339 rtx a, rtx b, rtx cr, rtx true_label)
340{
341 gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
342 || (a != NULL_RTX && b != NULL_RTX));
343
344 if (cr != NULL_RTX)
345 gcc_assert (GET_MODE (cr) == cmpmode);
346 else
347 cr = gen_reg_rtx (cmpmode);
348
349 rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label);
350
351 if (a != NULL_RTX)
352 emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b));
353
354 rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);
355
356 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
357 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
358 JUMP_LABEL (j) = true_label;
359 LABEL_NUSES (true_label) += 1;
360}
361
362/* Emit an isel of the proper mode for DEST.
363
364 DEST is the isel destination register.
365 SRC1 is the isel source if CR is true.
366 SRC2 is the isel source if CR is false.
367 CR is the condition for the isel. */
368static void
369do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr)
370{
371 if (GET_MODE (dest) == DImode)
372 emit_insn (gen_isel_signed_di (dest, cmp, src_t, src_f, cr));
373 else
374 emit_insn (gen_isel_signed_si (dest, cmp, src_t, src_f, cr));
375}
376
377/* Emit a subtract of the proper mode for DEST.
378
379 DEST is the destination register for the subtract.
380 SRC1 is the first subtract input.
381 SRC2 is the second subtract input.
382
383 Computes DEST = SRC1-SRC2. */
384static void
385do_sub3 (rtx dest, rtx src1, rtx src2)
386{
387 if (GET_MODE (dest) == DImode)
388 emit_insn (gen_subdi3 (dest, src1, src2));
389 else
390 emit_insn (gen_subsi3 (dest, src1, src2));
391}
392
393/* Emit an add of the proper mode for DEST.
394
395 DEST is the destination register for the add.
396 SRC1 is the first add input.
397 SRC2 is the second add input.
398
399 Computes DEST = SRC1+SRC2. */
400static void
401do_add3 (rtx dest, rtx src1, rtx src2)
402{
403 if (GET_MODE (dest) == DImode)
404 emit_insn (gen_adddi3 (dest, src1, src2));
405 else
406 emit_insn (gen_addsi3 (dest, src1, src2));
407}
408
409/* Generate rtl for a load, shift, and compare of less than a full word.
410
411 LOAD_MODE is the machine mode for the loads.
412 DIFF is the reg for the difference.
413 CMP_REM is the reg containing the remaining bytes to compare.
414 DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
415 SRC1_ADDR is the first source address.
416 SRC2_ADDR is the second source address.
417 ORIG_SRC1 is the original first source block's address rtx.
418 ORIG_SRC2 is the original second source block's address rtx. */
419static void
420do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem, rtx dcond,
421 rtx src1_addr, rtx src2_addr, rtx orig_src1, rtx orig_src2)
422{
423 HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
424 rtx shift_amount = gen_reg_rtx (word_mode);
425 rtx d1 = gen_reg_rtx (word_mode);
426 rtx d2 = gen_reg_rtx (word_mode);
427
428 do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1);
429 do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2);
430 do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem);
431
432 if (word_mode == DImode)
433 {
434 emit_insn (gen_ashldi3 (shift_amount, shift_amount,
435 GEN_INT (LOG2_BITS_PER_UNIT)));
436 emit_insn (gen_lshrdi3 (d1, d1,
437 gen_lowpart (SImode, shift_amount)));
438 emit_insn (gen_lshrdi3 (d2, d2,
439 gen_lowpart (SImode, shift_amount)));
440 }
441 else
442 {
443 emit_insn (gen_ashlsi3 (shift_amount, shift_amount,
444 GEN_INT (LOG2_BITS_PER_UNIT)));
445 emit_insn (gen_lshrsi3 (d1, d1, shift_amount));
446 emit_insn (gen_lshrsi3 (d2, d2, shift_amount));
447 }
448
449 if (TARGET_P9_MISC)
450 {
451 /* Generate a compare, and convert with a setb later. */
452 rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
453 emit_insn (gen_rtx_SET (dcond, cmp));
454 }
455 else
456 {
457 if (word_mode == DImode)
458 emit_insn (gen_subfdi3_carry (diff, d2, d1));
459 else
460 emit_insn (gen_subfsi3_carry (diff, d2, d1));
461 }
462}
463
464/* Generate rtl for an overlapping load and compare of less than a
465 full load_mode. This assumes that the previous word is part of the
466 block being compared so it's ok to back up part of a word so we can
467 compare the last unaligned full word that ends at the end of the block.
468
469 LOAD_MODE is the machine mode for the loads.
470 ISCONST tells whether the remaining length is a constant or in a register.
471 BYTES_REM is the remaining length if ISCONST is true.
472 DIFF is the reg for the difference.
473 CMP_REM is the reg containing the remaining bytes to compare if !ISCONST.
474 DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
475 SRC1_ADDR is the first source address.
476 SRC2_ADDR is the second source address.
477 ORIG_SRC1 is the original first source block's address rtx.
478 ORIG_SRC2 is the original second source block's address rtx. */
479static void
480do_overlap_load_compare (machine_mode load_mode, bool isConst,
481 HOST_WIDE_INT bytes_rem, rtx diff,
482 rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr,
483 rtx orig_src1, rtx orig_src2)
484{
485 HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
486 HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem;
487 rtx d1 = gen_reg_rtx (word_mode);
488 rtx d2 = gen_reg_rtx (word_mode);
489
490 rtx addr1, addr2;
491 if (!isConst || addr_adj)
492 {
493 rtx adj_reg = gen_reg_rtx (word_mode);
494 if (isConst)
495 emit_move_insn (adj_reg, GEN_INT (-addr_adj));
496 else
497 {
498 rtx reg_lms = gen_reg_rtx (word_mode);
499 emit_move_insn (reg_lms, GEN_INT (load_mode_size));
500 do_sub3 (adj_reg, cmp_rem, reg_lms);
501 }
502
503 addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg);
504 addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg);
505 }
506 else
507 {
508 addr1 = src1_addr;
509 addr2 = src2_addr;
510 }
511
512 do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
513 do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);
514
515 if (TARGET_P9_MISC)
516 {
517 /* Generate a compare, and convert with a setb later. */
518 rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
519 emit_insn (gen_rtx_SET (dcond, cmp));
520 }
521 else
522 {
523 if (word_mode == DImode)
524 emit_insn (gen_subfdi3_carry (diff, d2, d1));
525 else
526 emit_insn (gen_subfsi3_carry (diff, d2, d1));
527 }
528}
529
530/* Expand a block compare operation using loop code, and return true
531 if successful. Return false if we should let the compiler generate
532 normal code, probably a memcmp call.
533
534 OPERANDS[0] is the target (result).
535 OPERANDS[1] is the first source.
536 OPERANDS[2] is the second source.
537 OPERANDS[3] is the length.
538 OPERANDS[4] is the alignment. */
539bool
540expand_compare_loop (rtx operands[])
541{
542 rtx target = operands[0];
543 rtx orig_src1 = operands[1];
544 rtx orig_src2 = operands[2];
545 rtx bytes_rtx = operands[3];
546 rtx align_rtx = operands[4];
547
548 /* This case is complicated to handle because the subtract
549 with carry instructions do not generate the 64-bit
550 carry and so we must emit code to calculate it ourselves.
551 We choose not to implement this yet. */
552 if (TARGET_32BIT && TARGET_POWERPC64)
553 return false;
554
555 /* Allow non-const length. */
556 int bytes_is_const = CONST_INT_P (bytes_rtx);
557
558 /* This must be a fixed size alignment. */
559 if (!CONST_INT_P (align_rtx))
560 return false;
561
562 HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
563 HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
564 HOST_WIDE_INT minalign = MIN (align1, align2);
565
566 bool isP7 = (rs6000_tune == PROCESSOR_POWER7);
567
568 gcc_assert (GET_MODE (target) == SImode);
569
570 /* Anything to move? */
571 HOST_WIDE_INT bytes = 0;
572 if (bytes_is_const)
573 bytes = INTVAL (bytes_rtx);
574
575 if (bytes_is_const && bytes == 0)
576 return true;
577
578 /* Limit the amount we compare, if known statically. */
579 HOST_WIDE_INT max_bytes;
580 switch (rs6000_tune)
581 {
582 case PROCESSOR_POWER7:
583 if (!bytes_is_const)
584 if (minalign < 8)
585 max_bytes = 0;
586 else
587 max_bytes = 128;
588 else
589 if (minalign < 8)
590 max_bytes = 32;
591 else
592 max_bytes = 128;
593 break;
594 case PROCESSOR_POWER8:
595 if (!bytes_is_const)
596 max_bytes = 0;
597 else
598 if (minalign < 8)
599 max_bytes = 128;
600 else
601 max_bytes = 64;
602 break;
603 case PROCESSOR_POWER9:
604 if (bytes_is_const)
605 max_bytes = 191;
606 else
607 max_bytes = 0;
608 break;
609 default:
610 max_bytes = 128;
611 }
612
613 /* Allow the option to override the default. */
614 if (rs6000_block_compare_inline_loop_limit >= 0)
615 max_bytes = (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_loop_limit;
616
617 if (max_bytes == 0)
618 return false;
619
620 rtx cmp_rem = gen_reg_rtx (word_mode); /* Remainder for library call. */
621 rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop. */
622 HOST_WIDE_INT niter;
623 rtx iter = gen_reg_rtx (word_mode);
624 rtx iv1 = gen_reg_rtx (word_mode);
625 rtx iv2 = gen_reg_rtx (word_mode);
626 rtx d1_1 = gen_reg_rtx (word_mode); /* Addr expression src1+iv1 */
627 rtx d1_2 = gen_reg_rtx (word_mode); /* Addr expression src1+iv2 */
628 rtx d2_1 = gen_reg_rtx (word_mode); /* Addr expression src2+iv1 */
629 rtx d2_2 = gen_reg_rtx (word_mode); /* Addr expression src2+iv2 */
630
631 /* Strip unneeded subreg from length if there is one. */
632 if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx))
633 bytes_rtx = SUBREG_REG (bytes_rtx);
634 /* Extend bytes_rtx to word_mode if needed. But, we expect only to
635 maybe have to deal with the case were bytes_rtx is SImode and
636 word_mode is DImode. */
637 if (!bytes_is_const)
638 {
639 if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
640 /* Do not expect length longer than word_mode. */
641 return false;
642 else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode))
643 {
644 bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
645 bytes_rtx = force_reg (word_mode,
646 gen_rtx_fmt_e (ZERO_EXTEND, word_mode,
647 bytes_rtx));
648 }
649 else
650 /* Make sure it's in a register before we get started. */
651 bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
652 }
653
654 machine_mode load_mode = word_mode;
655 HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
656
657 /* Number of bytes per iteration of the unrolled loop. */
658 HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
659 /* max iters and bytes compared in the loop. */
660 HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes;
661 HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes;
662 int l2lb = floor_log2 (loop_bytes);
663
664 if (bytes_is_const && (max_bytes < load_mode_size
665 || !IN_RANGE (bytes, load_mode_size, max_bytes)))
666 return false;
667
668 bool no_remainder_code = false;
669 rtx final_label = gen_label_rtx ();
670 rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
671 rtx diff_label = gen_label_rtx ();
672 rtx library_call_label = NULL;
673 rtx cleanup_label = gen_label_rtx ();
674
675 rtx cr;
676
677 rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
678 rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));
679
680 /* Difference found is stored here before jump to diff_label. */
681 rtx diff = gen_reg_rtx (word_mode);
682 rtx j;
683
684 /* Example of generated code for 35 bytes aligned 1 byte.
685
686 mtctr 8
687 li 6,0
688 li 5,8
689 .L13:
690 ldbrx 7,3,6
691 ldbrx 9,10,6
692 ldbrx 0,3,5
693 ldbrx 4,10,5
694 addi 6,6,16
695 addi 5,5,16
696 subfc. 9,9,7
697 bne 0,.L10
698 subfc. 9,4,0
699 bdnzt 2,.L13
700 bne 0,.L10
701 add 3,3,6
702 add 10,10,6
703 addi 9,3,-5
704 ldbrx 7,0,9
705 addi 9,10,-5
706 ldbrx 9,0,9
707 subfc 9,9,7
708 .p2align 4,,15
709 .L10:
710 popcntd 9,9
711 subfe 10,10,10
712 or 9,9,10
713
714 Compiled with -fno-reorder-blocks for clarity. */
715
716 /* Structure of what we're going to do:
717 Two separate lengths: what we will compare before bailing to library
718 call (max_bytes), and the total length to be checked.
719 if length <= 16, branch to linear cleanup code starting with
720 remainder length check (length not known at compile time)
721 set up 2 iv's and load count reg, compute remainder length
722 unrollx2 compare loop
723 if loop exit due to a difference, branch to difference handling code
724 if remainder length < 8, branch to final cleanup compare
725 load and compare 8B
726 final cleanup comparison (depends on alignment and length)
727 load 8B, shift off bytes past length, compare
728 load 8B ending at last byte and compare
729 load/compare 1 byte at a time (short block abutting 4k boundary)
730 difference handling, 64->32 conversion
731 final result
732 branch around memcmp call
733 memcmp library call
734 */
735
736 /* If bytes is not const, compare length and branch directly
737 to the cleanup code that can handle 0-16 bytes if length
738 is >= 16. Stash away bytes-max_bytes for the library call. */
739 if (bytes_is_const)
740 {
741 /* These need to be set for some of the places we may jump to. */
742 if (bytes > max_bytes)
743 {
744 no_remainder_code = true;
745 niter = max_loop_iter;
746 library_call_label = gen_label_rtx ();
747 }
748 else
749 {
750 niter = bytes / loop_bytes;
751 }
752 emit_move_insn (iter, GEN_INT (niter));
753 emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes));
754 emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes));
755 }
756 else
757 {
758 library_call_label = gen_label_rtx ();
759
760 /* If we go to the cleanup code, it expects length to be in cmp_rem. */
761 emit_move_insn (cmp_rem, bytes_rtx);
762
763 /* Check for > max_bytes bytes. We want to bail out as quickly as
764 possible if we have to go over to memcmp. */
765 do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
766 NULL_RTX, library_call_label);
767
768 /* Check for < loop_bytes bytes. */
769 do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes),
770 NULL_RTX, cleanup_label);
771
772 /* Loop compare bytes and iterations if bytes>max_bytes. */
773 rtx mb_reg = gen_reg_rtx (word_mode);
774 emit_move_insn (mb_reg, GEN_INT (max_loop_bytes));
775 rtx mi_reg = gen_reg_rtx (word_mode);
776 emit_move_insn (mi_reg, GEN_INT (max_loop_iter));
777
778 /* Compute number of loop iterations if bytes <= max_bytes. */
779 if (word_mode == DImode)
780 emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb)));
781 else
782 emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb)));
783
784 /* Compute bytes to compare in loop if bytes <= max_bytes. */
785 rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb);
786 if (word_mode == DImode)
787 {
788 emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask));
789 }
790 else
791 {
792 emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask));
793 }
794
795 /* Check for bytes <= max_bytes. */
796 if (TARGET_ISEL)
797 {
798 /* P9 has fast isel so we use one compare and two isel. */
799 cr = gen_reg_rtx (CCmode);
800 rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx,
801 GEN_INT (max_bytes));
802 emit_move_insn (cr, compare_rtx);
803 rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx);
804 do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr);
805 do_isel (iter, cmp_rtx, iter, mi_reg, cr);
806 }
807 else
808 {
809 rtx lab_after = gen_label_rtx ();
810 do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes),
811 NULL_RTX, lab_after);
812 emit_move_insn (loop_cmp, mb_reg);
813 emit_move_insn (iter, mi_reg);
814 emit_label (lab_after);
815 }
816
817 /* Now compute remainder bytes which isn't used until after the loop. */
818 do_sub3 (cmp_rem, bytes_rtx, loop_cmp);
819 }
820
821 rtx dcond = NULL_RTX; /* Used for when we jump to diff_label. */
822 /* For p9 we need to have just one of these as multiple places define
823 it and it gets used by the setb at the end. */
824 if (TARGET_P9_MISC)
825 dcond = gen_reg_rtx (CCUNSmode);
826
827 if (!bytes_is_const || bytes >= loop_bytes)
828 {
829 /* It should not be possible to come here if remaining bytes is
830 < 16 in the runtime case either. Compute number of loop
831 iterations. We compare 2*word_mode per iteration so 16B for
832 64-bit code and 8B for 32-bit. Set up two induction
833 variables and load count register. */
834
835 /* HACK ALERT: create hard reg for CTR here. If we just use a
836 pseudo, cse will get rid of it and then the allocator will
837 see it used in the lshr above and won't give us ctr. */
838 rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
839 emit_move_insn (ctr, iter);
840 emit_move_insn (diff, GEN_INT (0));
841 emit_move_insn (iv1, GEN_INT (0));
842 emit_move_insn (iv2, GEN_INT (load_mode_size));
843
844 /* inner loop to compare 2*word_mode */
845 rtx loop_top_label = gen_label_rtx ();
846 emit_label (loop_top_label);
847
848 rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1);
849 rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1);
850
851 do_load_for_compare_from_addr (load_mode, d1_1,
852 src1_ix1, orig_src1);
853 do_load_for_compare_from_addr (load_mode, d2_1,
854 src2_ix1, orig_src2);
855 do_add3 (iv1, iv1, GEN_INT (loop_bytes));
856
857 rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2);
858 rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2);
859
860 do_load_for_compare_from_addr (load_mode, d1_2,
861 src1_ix2, orig_src1);
862 do_load_for_compare_from_addr (load_mode, d2_2,
863 src2_ix2, orig_src2);
864 do_add3 (iv2, iv2, GEN_INT (loop_bytes));
865
866 if (TARGET_P9_MISC)
867 {
868 /* Generate a compare, and convert with a setb later. */
869 rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
870 emit_insn (gen_rtx_SET (dcond, cmp));
871 }
872 else
873 {
874 dcond = gen_reg_rtx (CCmode);
875 if (word_mode == DImode)
876 emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
877 else
878 emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
879 }
880
881 do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
882 dcond, diff_label);
883
884 if (TARGET_P9_MISC)
885 {
886 /* Generate a compare, and convert with a setb later. */
887 rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2);
888 emit_insn (gen_rtx_SET (dcond, cmp));
889 }
890 else
891 {
892 dcond = gen_reg_rtx (CCmode);
893 if (word_mode == DImode)
894 emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond));
895 else
896 emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond));
897 }
898
899 rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2);
900 if (TARGET_64BIT)
901 j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr,
902 eqrtx, dcond));
903 else
904 j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
905 eqrtx, dcond));
906 JUMP_LABEL (j) = loop_top_label;
907 LABEL_NUSES (loop_top_label) += 1;
908 }
909
910 HOST_WIDE_INT bytes_remaining = 0;
911 if (bytes_is_const)
912 bytes_remaining = (bytes % loop_bytes);
913
914 /* If diff is nonzero, branch to difference handling
915 code. If we exit here with a nonzero diff, it is
916 because the second word differed. */
917 if (TARGET_P9_MISC)
918 do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond, diff_label);
919 else
920 do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX, diff_label);
921
922 if (library_call_label != NULL && bytes_is_const && bytes > max_bytes)
923 {
924 /* If the length is known at compile time, then we will always
925 have a remainder to go to the library call with. */
926 rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, library_call_label);
927 j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref));
928 JUMP_LABEL (j) = library_call_label;
929 LABEL_NUSES (library_call_label) += 1;
930 emit_barrier ();
931 }
932
933 if (bytes_is_const && bytes_remaining == 0)
934 {
935 /* No remainder and if we are here then diff is 0 so just return 0 */
936 if (TARGET_64BIT)
937 emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
938 else
939 emit_move_insn (target, diff);
940 j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
941 JUMP_LABEL (j) = final_label;
942 LABEL_NUSES (final_label) += 1;
943 emit_barrier ();
944 }
945 else if (!no_remainder_code)
946 {
947 /* Update addresses to point to the next word to examine. */
948 do_add3 (src1_addr, src1_addr, iv1);
949 do_add3 (src2_addr, src2_addr, iv1);
950
951 emit_label (cleanup_label);
952
953 if (!bytes_is_const)
954 {
955 /* If we're dealing with runtime length, we have to check if
956 it's zero after the loop. When length is known at compile
957 time the no-remainder condition is dealt with above. By
958 doing this after cleanup_label, we also deal with the
959 case where length is 0 at the start and we bypass the
960 loop with a branch to cleanup_label. */
961 emit_move_insn (target, const0_rtx);
962 do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
963 NULL_RTX, final_label);
964 }
965
966 rtx final_cleanup = gen_label_rtx ();
967 rtx cmp_rem_before = gen_reg_rtx (word_mode);
968 /* Compare one more word_mode chunk if needed. */
969 if (!bytes_is_const
970 || (bytes_is_const && bytes_remaining >= load_mode_size))
971 {
972 /* If remainder length < word length, branch to final
973 cleanup compare. */
974 if (!bytes_is_const)
975 do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size),
976 NULL_RTX, final_cleanup);
977
978 /* load and compare 8B */
979 do_load_for_compare_from_addr (load_mode, d1_1,
980 src1_addr, orig_src1);
981 do_load_for_compare_from_addr (load_mode, d2_1,
982 src2_addr, orig_src2);
983
984 /* Compare the word, see if we need to do the last partial. */
985 if (TARGET_P9_MISC)
986 {
987 /* Generate a compare, and convert with a setb later. */
988 rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
989 emit_insn (gen_rtx_SET (dcond, cmp));
990 }
991 else
992 {
993 dcond = gen_reg_rtx (CCmode);
994 if (word_mode == DImode)
995 emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
996 else
997 emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
998 }
999
1000 do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
1001 dcond, diff_label);
1002
1003 do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size));
1004 do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size));
1005 emit_move_insn (cmp_rem_before, cmp_rem);
1006 do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size));
1007 if (bytes_is_const)
1008 bytes_remaining -= load_mode_size;
1009 else
1010 /* See if remaining length is now zero. We previously set
1011 target to 0 so we can just jump to the end. */
1012 do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
1013 NULL_RTX, final_label);
1014
1015 }
1016
1017 /* Cases:
1018 bytes_is_const
1019 We can always shift back to do an overlapping compare
1020 of the last chunk because we know length >= 8.
1021
1022 !bytes_is_const
1023 align>=load_mode_size
1024 Read word_mode and mask
1025 align<load_mode_size
1026 avoid stepping past end
1027
1028 Three strategies:
1029 * decrement address and do overlapping compare
1030 * read word_mode and mask
1031 * carefully avoid crossing 4k boundary
1032 */
1033
1034 if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7))
1035 && align1 >= load_mode_size && align2 >= load_mode_size)
1036 {
1037 /* Alignment is larger than word_mode so we do not need to be
1038 concerned with extra page crossings. But, we do not know
1039 that the length is larger than load_mode_size so we might
1040 end up compareing against data before the block if we try
1041 an overlapping compare. Also we use this on P7 for fixed length
1042 remainder because P7 doesn't like overlapping unaligned.
1043 Strategy: load 8B, shift off bytes past length, and compare. */
1044 emit_label (final_cleanup);
1045 do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
1046 src1_addr, src2_addr, orig_src1, orig_src2);
1047 }
1048 else if (bytes_remaining && bytes_is_const)
1049 {
1050 /* We do not do loop expand if length < 32 so we know at the
1051 end we can do an overlapping compare.
1052 Strategy: shift address back and do word_mode load that
1053 ends at the end of the block. */
1054 emit_label (final_cleanup);
1055 do_overlap_load_compare (load_mode, true, bytes_remaining, diff,
1056 cmp_rem, dcond, src1_addr, src2_addr,
1057 orig_src1, orig_src2);
1058 }
1059 else if (!bytes_is_const)
1060 {
1061 rtx handle4k_label = gen_label_rtx ();
1062 rtx nonconst_overlap = gen_label_rtx ();
1063 emit_label (nonconst_overlap);
1064
1065 /* Here we have to handle the case where whe have runtime
1066 length which may be too short for overlap compare, and
1067 alignment is not at least load_mode_size so we have to
1068 tread carefully to avoid stepping across 4k boundaries. */
1069
1070 /* If the length after the loop was larger than word_mode
1071 size, we can just do an overlapping compare and we're
1072 done. We fall through to this code from the word_mode
1073 compare that preceeds this. */
1074 do_overlap_load_compare (load_mode, false, 0, diff,
1075 cmp_rem, dcond, src1_addr, src2_addr,
1076 orig_src1, orig_src2);
1077
1078 rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label);
1079 j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
1080 JUMP_LABEL (j) = diff_label;
1081 LABEL_NUSES (diff_label) += 1;
1082 emit_barrier ();
1083
1084 /* If we couldn't do the overlap compare we have to be more
1085 careful of the 4k boundary. Test to see if either
1086 address is less than word_mode_size away from a 4k
1087 boundary. If not, then we can do a load/shift/compare
1088 and we are done. We come to this code if length was less
1089 than word_mode_size. */
1090
1091 emit_label (final_cleanup);
1092
1093 /* We can still avoid the slow case if the length was larger
1094 than one loop iteration, in which case go do the overlap
1095 load compare path. */
1096 do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes),
1097 NULL_RTX, nonconst_overlap);
1098
1099 rtx rem4k = gen_reg_rtx (word_mode);
1100 rtx dist1 = gen_reg_rtx (word_mode);
1101 rtx dist2 = gen_reg_rtx (word_mode);
1102 do_sub3 (rem4k, GEN_INT (4096), cmp_rem);
1103 if (word_mode == SImode)
1104 emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff)));
1105 else
1106 emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff)));
1107 do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX, handle4k_label);
1108 if (word_mode == SImode)
1109 emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff)));
1110 else
1111 emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff)));
1112 do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX, handle4k_label);
1113
1114 /* We don't have a 4k boundary to deal with, so do
1115 a load/shift/compare and jump to diff. */
1116
1117 do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
1118 src1_addr, src2_addr, orig_src1, orig_src2);
1119
1120 j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
1121 JUMP_LABEL (j) = diff_label;
1122 LABEL_NUSES (diff_label) += 1;
1123 emit_barrier ();
1124
1125 /* Finally in the unlikely case we are inching up to a
1126 4k boundary we use a compact lbzx/compare loop to do
1127 it a byte at a time. */
1128
1129 emit_label (handle4k_label);
1130
1131 rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
1132 emit_move_insn (ctr, cmp_rem);
1133 rtx ixreg = gen_reg_rtx (Pmode);
1134 emit_move_insn (ixreg, const0_rtx);
1135
1136 rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg);
1137 rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg);
1138 rtx d1 = gen_reg_rtx (word_mode);
1139 rtx d2 = gen_reg_rtx (word_mode);
1140
1141 rtx fc_loop = gen_label_rtx ();
1142 emit_label (fc_loop);
1143
1144 do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1);
1145 do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2);
1146
1147 do_add3 (ixreg, ixreg, const1_rtx);
1148
1149 rtx cond = gen_reg_rtx (CCmode);
1150 rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2);
1151 rs6000_emit_dot_insn (diff, subexpr, 2, cond);
1152
1153 rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2);
1154 if (TARGET_64BIT)
1155 j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr,
1156 eqrtx, cond));
1157 else
1158 j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr,
1159 eqrtx, cond));
1160 JUMP_LABEL (j) = fc_loop;
1161 LABEL_NUSES (fc_loop) += 1;
1162
1163 if (TARGET_64BIT)
1164 emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
1165 else
1166 emit_move_insn (target, diff);
1167
1168 /* Since we are comparing bytes, the difference can be used
1169 as the final result and we are done here. */
1170 j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
1171 JUMP_LABEL (j) = final_label;
1172 LABEL_NUSES (final_label) += 1;
1173 emit_barrier ();
1174 }
1175 }
1176
1177 emit_label (diff_label);
1178 /* difference handling, 64->32 conversion */
1179
1180 /* We need to produce DI result from sub, then convert to target SI
1181 while maintaining <0 / ==0 / >0 properties. This sequence works:
1182 subfc L,A,B
1183 subfe H,H,H
1184 popcntd L,L
1185 rldimi L,H,6,0
1186
1187 This is an alternate one Segher cooked up if somebody
1188 wants to expand this for something that doesn't have popcntd:
1189 subfc L,a,b
1190 subfe H,x,x
1191 addic t,L,-1
1192 subfe v,t,L
1193 or z,v,H
1194
1195 And finally, p9 can just do this:
1196 cmpld A,B
1197 setb r */
1198
1199 if (TARGET_P9_MISC)
1200 emit_insn (gen_setb_unsigned (target, dcond));
1201 else
1202 {
1203 if (TARGET_64BIT)
1204 {
1205 rtx tmp_reg_ca = gen_reg_rtx (DImode);
1206 emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
1207 emit_insn (gen_popcntddi2 (diff, diff));
1208 emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca));
1209 emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
1210 }
1211 else
1212 {
1213 rtx tmp_reg_ca = gen_reg_rtx (SImode);
1214 emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
1215 emit_insn (gen_popcntdsi2 (diff, diff));
1216 emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca));
1217 }
1218 }
1219
1220 if (library_call_label != NULL)
1221 {
1222 /* Branch around memcmp call. */
1223 j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
1224 JUMP_LABEL (j) = final_label;
1225 LABEL_NUSES (final_label) += 1;
1226 emit_barrier ();
1227
1228 /* Make memcmp library call. cmp_rem is the remaining bytes that
1229 were compared and cmp_rem is the expected amount to be compared
1230 by memcmp. If we don't find a difference in the loop compare, do
1231 the library call directly instead of doing a small compare just
1232 to get to an arbitrary boundary before calling it anyway.
1233 Also, update addresses to point to the next word to examine. */
1234 emit_label (library_call_label);
1235
1236 rtx len_rtx = gen_reg_rtx (word_mode);
1237 if (bytes_is_const)
1238 {
1239 emit_move_insn (len_rtx, cmp_rem);
1240 do_add3 (src1_addr, src1_addr, iv1);
1241 do_add3 (src2_addr, src2_addr, iv1);
1242 }
1243 else
1244 emit_move_insn (len_rtx, bytes_rtx);
1245
1246 tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP);
1247 emit_library_call_value (XEXP (DECL_RTL (fun), 0),
1248 target, LCT_NORMAL, GET_MODE (target),
1249 src1_addr, Pmode,
1250 src2_addr, Pmode,
1251 len_rtx, GET_MODE (len_rtx));
1252 }
1253
1254 /* emit final_label */
1255 emit_label (final_label);
1256 return true;
1257}
1258
8845cb37
AS
1259/* Expand a block compare operation, and return true if successful.
1260 Return false if we should let the compiler generate normal code,
1261 probably a memcmp call.
1262
1263 OPERANDS[0] is the target (result).
1264 OPERANDS[1] is the first source.
1265 OPERANDS[2] is the second source.
1266 OPERANDS[3] is the length.
1267 OPERANDS[4] is the alignment. */
1268bool
1269expand_block_compare (rtx operands[])
1270{
1271 rtx target = operands[0];
1272 rtx orig_src1 = operands[1];
1273 rtx orig_src2 = operands[2];
1274 rtx bytes_rtx = operands[3];
1275 rtx align_rtx = operands[4];
1276 HOST_WIDE_INT cmp_bytes = 0;
1277 rtx src1 = orig_src1;
1278 rtx src2 = orig_src2;
1279
1280 /* This case is complicated to handle because the subtract
1281 with carry instructions do not generate the 64-bit
1282 carry and so we must emit code to calculate it ourselves.
1283 We choose not to implement this yet. */
1284 if (TARGET_32BIT && TARGET_POWERPC64)
1285 return false;
1286
5ec3397e
AS
1287 bool isP7 = (rs6000_tune == PROCESSOR_POWER7);
1288
1289 /* Allow this param to shut off all expansion. */
1290 if (rs6000_block_compare_inline_limit == 0)
1291 return false;
1292
1293 /* targetm.slow_unaligned_access -- don't do unaligned stuff.
1294 However slow_unaligned_access returns true on P7 even though the
1295 performance of this code is good there. */
1296 if (!isP7
1297 && (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
1298 || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2))))
8845cb37
AS
1299 return false;
1300
5ec3397e
AS
1301 /* Unaligned l*brx traps on P7 so don't do this. However this should
1302 not affect much because LE isn't really supported on P7 anyway. */
1303 if (isP7 && !BYTES_BIG_ENDIAN)
1304 return false;
1305
1306 /* If this is not a fixed size compare, try generating loop code and
1307 if that fails just call memcmp. */
1308 if (!CONST_INT_P (bytes_rtx))
1309 return expand_compare_loop (operands);
1310
8845cb37
AS
1311 /* This must be a fixed size alignment. */
1312 if (!CONST_INT_P (align_rtx))
1313 return false;
1314
1315 unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;
1316
8845cb37
AS
1317 gcc_assert (GET_MODE (target) == SImode);
1318
1319 /* Anything to move? */
1320 unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
1321 if (bytes == 0)
1322 return true;
1323
8845cb37
AS
1324 rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
1325 rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
1326 /* P7/P8 code uses cond for subfc. but P9 uses
1327 it for cmpld which needs CCUNSmode. */
1328 rtx cond;
1329 if (TARGET_P9_MISC)
1330 cond = gen_reg_rtx (CCUNSmode);
1331 else
1332 cond = gen_reg_rtx (CCmode);
1333
1334 /* If we have an LE target without ldbrx and word_mode is DImode,
1335 then we must avoid using word_mode. */
1336 int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
1337 && word_mode == DImode);
1338
1339 /* Strategy phase. How many ops will this take and should we expand it? */
1340
1341 unsigned HOST_WIDE_INT offset = 0;
1342 machine_mode load_mode =
1343 select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
1344 unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
1345
5ec3397e
AS
1346 /* We don't want to generate too much code. The loop code can take
1347 over for lengths greater than 31 bytes. */
1348 unsigned HOST_WIDE_INT max_bytes = rs6000_block_compare_inline_limit;
8845cb37 1349 if (!IN_RANGE (bytes, 1, max_bytes))
5ec3397e
AS
1350 return expand_compare_loop (operands);
1351
1352 /* The code generated for p7 and older is not faster than glibc
1353 memcmp if alignment is small and length is not short, so bail
1354 out to avoid those conditions. */
1355 if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
1356 && ((base_align == 1 && bytes > 16)
1357 || (base_align == 2 && bytes > 32)))
8845cb37
AS
1358 return false;
1359
1360 bool generate_6432_conversion = false;
1361 rtx convert_label = NULL;
1362 rtx final_label = NULL;
1363
1364 /* Example of generated code for 18 bytes aligned 1 byte.
1365 Compiled with -fno-reorder-blocks for clarity.
1366 ldbrx 10,31,8
1367 ldbrx 9,7,8
1368 subfc. 9,9,10
1369 bne 0,.L6487
1370 addi 9,12,8
1371 addi 5,11,8
1372 ldbrx 10,0,9
1373 ldbrx 9,0,5
1374 subfc. 9,9,10
1375 bne 0,.L6487
1376 addi 9,12,16
1377 lhbrx 10,0,9
1378 addi 9,11,16
1379 lhbrx 9,0,9
1380 subf 9,9,10
1381 b .L6488
1382 .p2align 4,,15
1383 .L6487: #convert_label
1384 popcntd 9,9
1385 subfe 10,10,10
1386 or 9,9,10
1387 .L6488: #final_label
1388 extsw 10,9
1389
1390 We start off with DImode for two blocks that jump to the DI->SI conversion
1391 if the difference is found there, then a final block of HImode that skips
1392 the DI->SI conversion. */
1393
1394 while (bytes > 0)
1395 {
1396 unsigned int align = compute_current_alignment (base_align, offset);
1397 if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
1398 load_mode = select_block_compare_mode (offset, bytes, align,
1399 word_mode_ok);
1400 else
1401 load_mode = select_block_compare_mode (0, bytes, align, word_mode_ok);
1402 load_mode_size = GET_MODE_SIZE (load_mode);
1403 if (bytes >= load_mode_size)
1404 cmp_bytes = load_mode_size;
1405 else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
1406 {
1407 /* Move this load back so it doesn't go past the end.
1408 P8/P9 can do this efficiently. */
1409 unsigned int extra_bytes = load_mode_size - bytes;
1410 cmp_bytes = bytes;
1411 if (extra_bytes < offset)
1412 {
1413 offset -= extra_bytes;
1414 cmp_bytes = load_mode_size;
1415 bytes = cmp_bytes;
1416 }
1417 }
1418 else
1419 /* P7 and earlier can't do the overlapping load trick fast,
1420 so this forces a non-overlapping load and a shift to get
1421 rid of the extra bytes. */
1422 cmp_bytes = bytes;
1423
1424 src1 = adjust_address (orig_src1, load_mode, offset);
1425 src2 = adjust_address (orig_src2, load_mode, offset);
1426
1427 if (!REG_P (XEXP (src1, 0)))
1428 {
1429 rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
1430 src1 = replace_equiv_address (src1, src1_reg);
1431 }
f4f867ca 1432 set_mem_size (src1, load_mode_size);
8845cb37
AS
1433
1434 if (!REG_P (XEXP (src2, 0)))
1435 {
1436 rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
1437 src2 = replace_equiv_address (src2, src2_reg);
1438 }
f4f867ca 1439 set_mem_size (src2, load_mode_size);
8845cb37
AS
1440
1441 do_load_for_compare (tmp_reg_src1, src1, load_mode);
1442 do_load_for_compare (tmp_reg_src2, src2, load_mode);
1443
1444 if (cmp_bytes < load_mode_size)
1445 {
1446 /* Shift unneeded bytes off. */
1447 rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
1448 if (word_mode == DImode)
1449 {
1450 emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
1451 emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
1452 }
1453 else
1454 {
1455 emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
1456 emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
1457 }
1458 }
1459
1460 int remain = bytes - cmp_bytes;
1461 if (GET_MODE_SIZE (GET_MODE (target)) > GET_MODE_SIZE (load_mode))
1462 {
1463 /* Target is larger than load size so we don't need to
1464 reduce result size. */
1465
1466 /* We previously did a block that need 64->32 conversion but
1467 the current block does not, so a label is needed to jump
1468 to the end. */
1469 if (generate_6432_conversion && !final_label)
1470 final_label = gen_label_rtx ();
1471
1472 if (remain > 0)
1473 {
1474 /* This is not the last block, branch to the end if the result
1475 of this subtract is not zero. */
1476 if (!final_label)
1477 final_label = gen_label_rtx ();
1478 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
1479 rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
1480 rtx cr = gen_reg_rtx (CCmode);
1481 rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
1482 emit_insn (gen_movsi (target,
1483 gen_lowpart (SImode, tmp_reg_src2)));
1484 rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
1485 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
1486 fin_ref, pc_rtx);
1487 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
1488 JUMP_LABEL (j) = final_label;
1489 LABEL_NUSES (final_label) += 1;
1490 }
1491 else
1492 {
1493 if (word_mode == DImode)
1494 {
1495 emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
1496 tmp_reg_src2));
1497 emit_insn (gen_movsi (target,
1498 gen_lowpart (SImode, tmp_reg_src2)));
1499 }
1500 else
1501 emit_insn (gen_subsi3 (target, tmp_reg_src1, tmp_reg_src2));
1502
1503 if (final_label)
1504 {
1505 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
1506 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
5ec3397e 1507 JUMP_LABEL (j) = final_label;
8845cb37
AS
1508 LABEL_NUSES (final_label) += 1;
1509 emit_barrier ();
1510 }
1511 }
1512 }
1513 else
1514 {
1515 /* Do we need a 64->32 conversion block? We need the 64->32
1516 conversion even if target size == load_mode size because
1517 the subtract generates one extra bit. */
1518 generate_6432_conversion = true;
1519
1520 if (remain > 0)
1521 {
1522 if (!convert_label)
1523 convert_label = gen_label_rtx ();
1524
1525 /* Compare to zero and branch to convert_label if not zero. */
1526 rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
1527 if (TARGET_P9_MISC)
1528 {
1529 /* Generate a compare, and convert with a setb later. */
1530 rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
1531 tmp_reg_src2);
1532 emit_insn (gen_rtx_SET (cond, cmp));
1533 }
1534 else
1535 /* Generate a subfc. and use the longer
1536 sequence for conversion. */
1537 if (TARGET_64BIT)
1538 emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
1539 tmp_reg_src1, cond));
1540 else
1541 emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
1542 tmp_reg_src1, cond));
1543 rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
1544 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
1545 cvt_ref, pc_rtx);
1546 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
5ec3397e 1547 JUMP_LABEL (j) = convert_label;
8845cb37
AS
1548 LABEL_NUSES (convert_label) += 1;
1549 }
1550 else
1551 {
1552 /* Just do the subtract/compare. Since this is the last block
1553 the convert code will be generated immediately following. */
1554 if (TARGET_P9_MISC)
1555 {
1556 rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
1557 tmp_reg_src2);
1558 emit_insn (gen_rtx_SET (cond, cmp));
1559 }
1560 else
1561 if (TARGET_64BIT)
1562 emit_insn (gen_subfdi3_carry (tmp_reg_src2, tmp_reg_src2,
1563 tmp_reg_src1));
1564 else
1565 emit_insn (gen_subfsi3_carry (tmp_reg_src2, tmp_reg_src2,
1566 tmp_reg_src1));
1567 }
1568 }
1569
1570 offset += cmp_bytes;
1571 bytes -= cmp_bytes;
1572 }
1573
1574 if (generate_6432_conversion)
1575 {
1576 if (convert_label)
1577 emit_label (convert_label);
1578
1579 /* We need to produce DI result from sub, then convert to target SI
1580 while maintaining <0 / ==0 / >0 properties. This sequence works:
1581 subfc L,A,B
1582 subfe H,H,H
1583 popcntd L,L
1584 rldimi L,H,6,0
1585
1586 This is an alternate one Segher cooked up if somebody
1587 wants to expand this for something that doesn't have popcntd:
1588 subfc L,a,b
1589 subfe H,x,x
1590 addic t,L,-1
1591 subfe v,t,L
1592 or z,v,H
1593
1594 And finally, p9 can just do this:
1595 cmpld A,B
1596 setb r */
1597
1598 if (TARGET_P9_MISC)
1599 {
1600 emit_insn (gen_setb_unsigned (target, cond));
1601 }
1602 else
1603 {
1604 if (TARGET_64BIT)
1605 {
1606 rtx tmp_reg_ca = gen_reg_rtx (DImode);
1607 emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
1608 emit_insn (gen_popcntddi2 (tmp_reg_src2, tmp_reg_src2));
1609 emit_insn (gen_iordi3 (tmp_reg_src2, tmp_reg_src2, tmp_reg_ca));
1610 emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2)));
1611 }
1612 else
1613 {
1614 rtx tmp_reg_ca = gen_reg_rtx (SImode);
1615 emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
1616 emit_insn (gen_popcntdsi2 (tmp_reg_src2, tmp_reg_src2));
1617 emit_insn (gen_iorsi3 (target, tmp_reg_src2, tmp_reg_ca));
1618 }
1619 }
1620 }
1621
1622 if (final_label)
1623 emit_label (final_label);
1624
1625 gcc_assert (bytes == 0);
1626 return true;
1627}
1628
1629/* Generate alignment check and branch code to set up for
1630 strncmp when we don't have DI alignment.
1631 STRNCMP_LABEL is the label to branch if there is a page crossing.
1632 SRC is the string pointer to be examined.
1633 BYTES is the max number of bytes to compare. */
1634static void
1635expand_strncmp_align_check (rtx strncmp_label, rtx src, HOST_WIDE_INT bytes)
1636{
1637 rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label);
1638 rtx src_check = copy_addr_to_reg (XEXP (src, 0));
1639 if (GET_MODE (src_check) == SImode)
1640 emit_insn (gen_andsi3 (src_check, src_check, GEN_INT (0xfff)));
1641 else
1642 emit_insn (gen_anddi3 (src_check, src_check, GEN_INT (0xfff)));
1643 rtx cond = gen_reg_rtx (CCmode);
1644 emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_check,
1645 GEN_INT (4096 - bytes)));
1646
0c791c59 1647 rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx);
8845cb37
AS
1648
1649 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
0c791c59 1650 lab_ref, pc_rtx);
8845cb37
AS
1651 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
1652 JUMP_LABEL (j) = strncmp_label;
1653 LABEL_NUSES (strncmp_label) += 1;
1654}

/* Expand a string compare operation with length, and return
   true if successful.  Return false if we should let the
   compiler generate normal code, probably a strncmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   If NO_LENGTH is zero, then:
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment in bytes.
   If NO_LENGTH is nonzero, then:
   OPERANDS[3] is the alignment in bytes.  */
bool
expand_strn_compare (rtx operands[], int no_length)
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx, align_rtx;
  if (no_length)
    {
      bytes_rtx = NULL;
      align_rtx = operands[3];
    }
  else
    {
      bytes_rtx = operands[3];
      align_rtx = operands[4];
    }
  unsigned HOST_WIDE_INT cmp_bytes = 0;
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;

  /* If we have a length, it must be constant.  This simplifies things
     a bit as we don't have to generate code to check if we've exceeded
     the length.  Later this could be expanded to handle a variable
     length.  */
  if (!no_length && !CONST_INT_P (bytes_rtx))
    return false;

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx);
  int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;

  /* If word_mode accesses at either source's alignment are slow on this
     target, punt rather than emit unaligned loads.  */
  if (targetm.slow_unaligned_access (word_mode, align1)
      || targetm.slow_unaligned_access (word_mode, align2))
    return false;

  gcc_assert (GET_MODE (target) == SImode);

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
		       && word_mode == DImode);

  unsigned int word_mode_size = GET_MODE_SIZE (word_mode);

  unsigned HOST_WIDE_INT offset = 0;
  unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available.  */
  unsigned HOST_WIDE_INT compare_length; /* How much to compare inline.  */
  if (no_length)
    /* Use this as a stand-in to determine the mode to use.  */
    bytes = rs6000_string_compare_inline_limit * word_mode_size;
  else
    bytes = UINTVAL (bytes_rtx);

  machine_mode load_mode =
    select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
  compare_length = rs6000_string_compare_inline_limit * load_mode_size;

  /* If we have equality at the end of the last compare and we have not
     found the end of the string, we need to call strcmp/strncmp to
     compare the remainder.  */
  bool equality_compare_rest = false;

  if (no_length)
    {
      bytes = compare_length;
      equality_compare_rest = true;
    }
  else
    {
      if (bytes <= compare_length)
	compare_length = bytes;
      else
	equality_compare_rest = true;
    }
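
  /* Worked example (values hypothetical): with
     rs6000_string_compare_inline_limit == 8 and an 8-byte load_mode,
     compare_length starts at 64.  A call strncmp (a, b, 100) keeps
     compare_length == 64 and sets equality_compare_rest, so up to 64
     bytes are compared inline and the library handles the remaining 36
     only if the strings are still equal and unterminated at that point;
     strncmp (a, b, 20) clamps compare_length to 20 and needs no library
     fallback.  */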

  rtx result_reg = gen_reg_rtx (word_mode);
  rtx final_move_label = gen_label_rtx ();
  rtx final_label = gen_label_rtx ();
  rtx begin_compare_label = NULL;

  if (base_align < 8)
    {
      /* Generate code that checks distance to 4k boundary for this case.  */
      begin_compare_label = gen_label_rtx ();
      rtx strncmp_label = gen_label_rtx ();
      rtx jmp;

      /* Strncmp for power8 in glibc does this:
	 rldicl r8,r3,0,52
	 cmpldi cr7,r8,4096-16
	 bgt cr7,L(pagecross)  */

      /* Make sure that the length we use for the alignment test and
	 the subsequent code generation are in agreement so we do not
	 go past the length we tested for a 4k boundary crossing.  */
      unsigned HOST_WIDE_INT align_test = compare_length;
      if (align_test < 8)
	{
	  align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
	  base_align = align_test;
	}
      else
	{
	  align_test = ROUND_UP (align_test, 8);
	  base_align = 8;
	}
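
      /* Worked example (values hypothetical): compare_length == 3 takes
	 the first branch and yields align_test == 4, base_align == 4;
	 compare_length == 12 takes the second and yields align_test == 16,
	 base_align == 8.  Either way align_test >= compare_length, so the
	 page-crossing check covers every byte the inline compare may
	 load.  */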

      if (align1 < 8)
	expand_strncmp_align_check (strncmp_label, src1, align_test);
      if (align2 < 8)
	expand_strncmp_align_check (strncmp_label, src2, align_test);

      /* Now generate the following sequence:
	 - branch to begin_compare
	 - strncmp_label
	 - call to strncmp
	 - branch to final_label
	 - begin_compare_label */

      rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
      JUMP_LABEL (jmp) = begin_compare_label;
      LABEL_NUSES (begin_compare_label) += 1;
      emit_barrier ();

      emit_label (strncmp_label);

      if (!REG_P (XEXP (src1, 0)))
	{
	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
	  src1 = replace_equiv_address (src1, src1_reg);
	}

      if (!REG_P (XEXP (src2, 0)))
	{
	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
	  src2 = replace_equiv_address (src2, src2_reg);
	}

      if (no_length)
	{
	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
				   force_reg (Pmode, XEXP (src2, 0)), Pmode);
	}
      else
	{
	  /* -m32 -mpowerpc64 results in word_mode being DImode even
	     though otherwise it is 32-bit.  The length arg to strncmp
	     is a size_t which will be the same size as pointers.  */
	  rtx len_rtx;
	  if (TARGET_64BIT)
	    len_rtx = gen_reg_rtx (DImode);
	  else
	    len_rtx = gen_reg_rtx (SImode);

	  emit_move_insn (len_rtx, bytes_rtx);

	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
				   force_reg (Pmode, XEXP (src2, 0)), Pmode,
				   len_rtx, GET_MODE (len_rtx));
	}

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
      emit_label (begin_compare_label);
    }

  rtx cleanup_label = NULL;
  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);

  /* Generate sequence of ld/ldbrx, cmpb to compare out
     to the length specified.  */
  unsigned HOST_WIDE_INT bytes_to_compare = compare_length;
  while (bytes_to_compare > 0)
    {
      /* Compare sequence:
	 check each 8B with: ld/ld cmpd bne
	 If equal, use rldicr/cmpb to check for zero byte.
	 cleanup code at end:
	 cmpb		get byte that differs
	 cmpb		look for zero byte
	 orc		combine
	 cntlzd		get bit of first zero/diff byte
	 subfic		convert for rldcl use
	 rldcl rldcl	extract diff/zero byte
	 subf		subtract for final result

	 The last compare can branch around the cleanup code if the
	 result is zero because the strings are exactly equal.  */
      unsigned int align = compute_current_alignment (base_align, offset);
      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	load_mode = select_block_compare_mode (offset, bytes_to_compare, align,
					       word_mode_ok);
      else
	load_mode = select_block_compare_mode (0, bytes_to_compare, align,
					       word_mode_ok);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes_to_compare >= load_mode_size)
	cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	{
	  /* Move this load back so it doesn't go past the end.
	     P8/P9 can do this efficiently.  */
	  unsigned int extra_bytes = load_mode_size - bytes_to_compare;
	  cmp_bytes = bytes_to_compare;
	  if (extra_bytes < offset)
	    {
	      offset -= extra_bytes;
	      cmp_bytes = load_mode_size;
	      bytes_to_compare = cmp_bytes;
	    }
	}
      else
	/* P7 and earlier can't do the overlapping load trick fast,
	   so this forces a non-overlapping load and a shift to get
	   rid of the extra bytes.  */
	cmp_bytes = bytes_to_compare;

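      /* Worked example of the overlapping-load case (values hypothetical):
	 with load_mode_size == 8, bytes_to_compare == 5 and offset == 8,
	 extra_bytes is 3, so the load is moved back to offset 5 and reads
	 bytes 5..12, re-comparing bytes 5..7 (already known equal) rather
	 than reading past byte 12, the end of the region.  */
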
      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      if (!REG_P (XEXP (src1, 0)))
	{
	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
	  src1 = replace_equiv_address (src1, src1_reg);
	}
      set_mem_size (src1, load_mode_size);

      if (!REG_P (XEXP (src2, 0)))
	{
	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
	  src2 = replace_equiv_address (src2, src2_reg);
	}
      set_mem_size (src2, load_mode_size);

      do_load_for_compare (tmp_reg_src1, src1, load_mode);
      do_load_for_compare (tmp_reg_src2, src2, load_mode);

      /* We must always left-align the data we read, and
	 clear any bytes to the right that are beyond the string.
	 Otherwise the cmpb sequence won't produce the correct
	 results.  The beginning of the compare will be done
	 with word_mode so will not have any extra shifts or
	 clear rights.  */

      if (load_mode_size < word_mode_size)
	{
	  /* Rotate left first.  */
	  rtx sh = GEN_INT (BITS_PER_UNIT * (word_mode_size - load_mode_size));
	  if (word_mode == DImode)
	    {
	      emit_insn (gen_rotldi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_rotldi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	  else
	    {
	      emit_insn (gen_rotlsi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_rotlsi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	}

      if (cmp_bytes < word_mode_size)
	{
	  /* Now clear right.  This plus the rotate can be
	     turned into a rldicr instruction.  */
	  HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
	  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
	  if (word_mode == DImode)
	    {
	      emit_insn (gen_anddi3_mask (tmp_reg_src1, tmp_reg_src1, mask));
	      emit_insn (gen_anddi3_mask (tmp_reg_src2, tmp_reg_src2, mask));
	    }
	  else
	    {
	      emit_insn (gen_andsi3_mask (tmp_reg_src1, tmp_reg_src1, mask));
	      emit_insn (gen_andsi3_mask (tmp_reg_src2, tmp_reg_src2, mask));
	    }
	}
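
      /* Illustration (assumed values, not taken from the source): with
	 word_mode DImode, a 4-byte load and cmp_bytes == 4, the rotate
	 shifts left by 32 and the mask is 0xffffffff00000000, so the pair
	 can combine into a single "rldicr rx,rx,32,31": the 4 loaded
	 bytes end up left-aligned and the low half is cleared.  */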

      /* Cases to handle.  A and B are chunks of the two strings.
	 1: Not end of comparison:
	 A != B: branch to cleanup code to compute result.
	 A == B: check for 0 byte, next block if not found.
	 2: End of the inline comparison:
	 A != B: branch to cleanup code to compute result.
	 A == B: check for 0 byte, call strcmp/strncmp.
	 3: Compared the requested N bytes:
	 A == B: branch to result 0.
	 A != B: cleanup code to compute result.  */

      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;

      rtx dst_label;
      if (remain > 0 || equality_compare_rest)
	{
	  /* Branch to cleanup code, otherwise fall through to do
	     more compares.  */
	  if (!cleanup_label)
	    cleanup_label = gen_label_rtx ();
	  dst_label = cleanup_label;
	}
      else
	/* Branch to end and produce result of 0.  */
	dst_label = final_move_label;

      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
      rtx cond = gen_reg_rtx (CCmode);

      /* Always produce the 0 result, it is needed if
	 cmpb finds a 0 byte in this chunk.  */
      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
      rs6000_emit_dot_insn (result_reg, tmp, 1, cond);

      rtx cmp_rtx;
      if (remain == 0 && !equality_compare_rest)
	cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
      else
	cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
					 lab_ref, pc_rtx);
      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
      JUMP_LABEL (j) = dst_label;
      LABEL_NUSES (dst_label) += 1;

      if (remain > 0 || equality_compare_rest)
	{
	  /* Generate a cmpb to test for a 0 byte and branch
	     to final result if found.  */
	  rtx cmpb_zero = gen_reg_rtx (word_mode);
	  rtx lab_ref_fin = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
	  rtx condz = gen_reg_rtx (CCmode);
	  rtx zero_reg = gen_reg_rtx (word_mode);
	  if (word_mode == SImode)
	    {
	      emit_insn (gen_movsi (zero_reg, GEN_INT (0)));
	      emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg));
	      if (cmp_bytes < word_mode_size)
		{
		  /* Don't want to look at zero bytes past end.  */
		  HOST_WIDE_INT mb =
		    BITS_PER_UNIT * (word_mode_size - cmp_bytes);
		  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
		  emit_insn (gen_andsi3_mask (cmpb_zero, cmpb_zero, mask));
		}
	    }
	  else
	    {
	      emit_insn (gen_movdi (zero_reg, GEN_INT (0)));
	      emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg));
	      if (cmp_bytes < word_mode_size)
		{
		  /* Don't want to look at zero bytes past end.  */
		  HOST_WIDE_INT mb =
		    BITS_PER_UNIT * (word_mode_size - cmp_bytes);
		  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
		  emit_insn (gen_anddi3_mask (cmpb_zero, cmpb_zero, mask));
		}
	    }

	  emit_move_insn (condz, gen_rtx_COMPARE (CCmode, cmpb_zero, zero_reg));
	  rtx cmpnz_rtx = gen_rtx_NE (VOIDmode, condz, const0_rtx);
	  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmpnz_rtx,
					     lab_ref_fin, pc_rtx);
	  rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	  JUMP_LABEL (j2) = final_move_label;
	  LABEL_NUSES (final_move_label) += 1;
	}
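
      /* The zero-byte test above emits, roughly (registers hypothetical,
	 rZ holding zero):

	     cmpb  r9,rA,rZ	# 0xff in each byte of rA that is zero
	     cmpd  cr0,r9,rZ	# any such byte?
	     bne   cr0,final_move_label

	 If a zero byte is found, result_reg already holds 0 from the
	 subtraction of two equal chunks, which is the correct result:
	 the strings are equal up to and including their terminator.  */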

      offset += cmp_bytes;
      bytes_to_compare -= cmp_bytes;
    }

  if (equality_compare_rest)
    {
      /* Update pointers past what has been compared already.  */
      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      if (!REG_P (XEXP (src1, 0)))
	{
	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
	  src1 = replace_equiv_address (src1, src1_reg);
	}
      set_mem_size (src1, load_mode_size);

      if (!REG_P (XEXP (src2, 0)))
	{
	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
	  src2 = replace_equiv_address (src2, src2_reg);
	}
      set_mem_size (src2, load_mode_size);

      /* Construct call to strcmp/strncmp to compare the rest of
	 the string.  */
      if (no_length)
	{
	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
				   force_reg (Pmode, XEXP (src2, 0)), Pmode);
	}
      else
	{
	  rtx len_rtx;
	  if (TARGET_64BIT)
	    len_rtx = gen_reg_rtx (DImode);
	  else
	    len_rtx = gen_reg_rtx (SImode);

	  emit_move_insn (len_rtx, GEN_INT (bytes - compare_length));
	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
				   force_reg (Pmode, XEXP (src2, 0)), Pmode,
				   len_rtx, GET_MODE (len_rtx));
	}

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }

  if (cleanup_label)
    emit_label (cleanup_label);

  /* Generate the final sequence that identifies the differing
     byte and generates the final result, taking into account
     zero bytes:

     cmpb	cmpb_result1, src1, src2
     cmpb	cmpb_result2, src1, zero
     orc	cmpb_result1, cmpb_result1, cmpb_result2
     cntlzd	get bit of first zero/diff byte
     addi	convert for rldcl use
     rldcl rldcl extract diff/zero byte
     subf	subtract for final result
  */

  rtx cmpb_diff = gen_reg_rtx (word_mode);
  rtx cmpb_zero = gen_reg_rtx (word_mode);
  rtx rot_amt = gen_reg_rtx (word_mode);
  rtx zero_reg = gen_reg_rtx (word_mode);

  rtx rot1_1 = gen_reg_rtx (word_mode);
  rtx rot1_2 = gen_reg_rtx (word_mode);
  rtx rot2_1 = gen_reg_rtx (word_mode);
  rtx rot2_2 = gen_reg_rtx (word_mode);

  if (word_mode == SImode)
    {
      emit_insn (gen_cmpbsi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2));
      emit_insn (gen_movsi (zero_reg, GEN_INT (0)));
      emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg));
      emit_insn (gen_one_cmplsi2 (cmpb_diff, cmpb_diff));
      emit_insn (gen_iorsi3 (cmpb_diff, cmpb_diff, cmpb_zero));
      emit_insn (gen_clzsi2 (rot_amt, cmpb_diff));
      emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotlsi3 (rot1_1, tmp_reg_src1,
			      gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotlsi3 (rot2_1, tmp_reg_src2,
			      gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subsi3 (result_reg, rot1_2, rot2_2));
    }
  else
    {
      emit_insn (gen_cmpbdi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2));
      emit_insn (gen_movdi (zero_reg, GEN_INT (0)));
      emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg));
      emit_insn (gen_one_cmpldi2 (cmpb_diff, cmpb_diff));
      emit_insn (gen_iordi3 (cmpb_diff, cmpb_diff, cmpb_zero));
      emit_insn (gen_clzdi2 (rot_amt, cmpb_diff));
      emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotldi3 (rot1_1, tmp_reg_src1,
			      gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotldi3 (rot2_1, tmp_reg_src2,
			      gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subdi3 (result_reg, rot1_2, rot2_2));
    }
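
  /* Worked example (assumed data): comparing "abcX..." with "abcY..."
     in DImode, the first difference is in byte 3 (counting from the
     most significant byte after the big-endian-order load).  The
     inverted cmpb or'd with the zero-byte cmpb has its first 0xff byte
     there, so cntlzd yields 24; adding 8 gives a rotate of 32, which
     brings byte 3 into the low-order byte of each rotated register.
     Masking with 0xff and subtracting produces 'X' - 'Y' as the signed
     result.  */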

  emit_label (final_move_label);
  emit_insn (gen_movsi (target,
			gen_lowpart (SImode, result_reg)));
  emit_label (final_label);
  return true;
}
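
/* A sketch of how this expander is reached (an assumption about the md
   patterns, which live in rs6000.md and may differ in detail): a
   cmpstrnsi or cmpstrsi define_expand hands its operands straight to
   expand_strn_compare and FAILs if inline expansion is declined, e.g.

     if (expand_strn_compare (operands, 0))   // cmpstrnsi: has a length
       DONE;
     else
       FAIL;

   so returning false above simply falls back to a library call.  */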

/* Expand a block move operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the source
   operands[2] is the length
   operands[3] is the alignment */

#define MAX_MOVE_REG 4
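
/* Illustrative decomposition (assumed target features): on a 64-bit
   Altivec target with 16-byte-aligned operands, a fixed 23-byte move
   is expanded as one 16-byte V4SImode move, then a 4-byte SImode move,
   a 2-byte HImode move and a final QImode byte: 16 + 4 + 2 + 1 == 23.
   Loads are emitted immediately while the matching stores are queued
   in stores[] and flushed every MAX_MOVE_REG moves, so loads and
   stores end up interleaved.  */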

int
expand_block_move (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx orig_src = operands[1];
  rtx bytes_rtx = operands[2];
  rtx align_rtx = operands[3];
  int constp = (GET_CODE (bytes_rtx) == CONST_INT);
  int align;
  int bytes;
  int offset;
  int move_bytes;
  rtx stores[MAX_MOVE_REG];
  int num_reg = 0;

  /* If this is not a fixed size move, just call memcpy.  */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment.  */
  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to move?  */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  if (bytes > rs6000_block_move_inline_limit)
    return 0;

  for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
    {
      union {
	rtx (*movmemsi) (rtx, rtx, rtx, rtx);
	rtx (*mov) (rtx, rtx);
      } gen_func;
      machine_mode mode = BLKmode;
      rtx src, dest;

      /* Altivec first, since it will be faster than a string move
	 when it applies, and usually not significantly larger.  */
      if (TARGET_ALTIVEC && bytes >= 16
	  && (TARGET_EFFICIENT_UNALIGNED_VSX || align >= 128))
	{
	  move_bytes = 16;
	  mode = V4SImode;
	  gen_func.mov = gen_movv4si;
	}
      else if (bytes >= 8 && TARGET_POWERPC64
	       && (align >= 64 || !STRICT_ALIGNMENT))
	{
	  move_bytes = 8;
	  mode = DImode;
	  gen_func.mov = gen_movdi;
	  if (offset == 0 && align < 64)
	    {
	      rtx addr;

	      /* If the address form is reg+offset with offset not a
		 multiple of four, reload into reg indirect form here
		 rather than waiting for reload.  This way we get one
		 reload, not one per load and/or store.  */
	      addr = XEXP (orig_dest, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_dest = replace_equiv_address (orig_dest, addr);
		}
	      addr = XEXP (orig_src, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_src = replace_equiv_address (orig_src, addr);
		}
	    }
	}
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
	{			/* move 4 bytes */
	  move_bytes = 4;
	  mode = SImode;
	  gen_func.mov = gen_movsi;
	}
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
	{			/* move 2 bytes */
	  move_bytes = 2;
	  mode = HImode;
	  gen_func.mov = gen_movhi;
	}
      else /* move 1 byte at a time */
	{
	  move_bytes = 1;
	  mode = QImode;
	  gen_func.mov = gen_movqi;
	}

      src = adjust_address (orig_src, mode, offset);
      dest = adjust_address (orig_dest, mode, offset);

      if (mode != BLKmode)
	{
	  rtx tmp_reg = gen_reg_rtx (mode);

	  emit_insn ((*gen_func.mov) (tmp_reg, src));
	  stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
	}

      if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
	{
	  int i;
	  for (i = 0; i < num_reg; i++)
	    emit_insn (stores[i]);
	  num_reg = 0;
	}

      if (mode == BLKmode)
	{
	  /* Move the address into scratch registers.  The movmemsi
	     patterns require zero offset.  */
	  if (!REG_P (XEXP (src, 0)))
	    {
	      rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
	      src = replace_equiv_address (src, src_reg);
	    }
	  set_mem_size (src, move_bytes);

	  if (!REG_P (XEXP (dest, 0)))
	    {
	      rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
	      dest = replace_equiv_address (dest, dest_reg);
	    }
	  set_mem_size (dest, move_bytes);

	  emit_insn ((*gen_func.movmemsi) (dest, src,
					   GEN_INT (move_bytes & 31),
					   align_rtx));
	}
    }

  return 1;
}