/* Subroutines used to expand string and block move, clear,
   compare and other operations for PowerPC.
   Copyright (C) 1991-2019 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ira.h"
#include "print-tree.h"
#include "varasm.h"
#include "explow.h"
#include "expr.h"
#include "output.h"
#include "target.h"

/* Expand a block clear operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the length
   operands[3] is the alignment */

int
expand_block_clear (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx bytes_rtx = operands[1];
  rtx align_rtx = operands[3];
  bool constp = CONST_INT_P (bytes_rtx);
  HOST_WIDE_INT align;
  HOST_WIDE_INT bytes;
  int offset;
  int clear_bytes;
  int clear_step;

  /* If this is not a fixed size clear, just call memset.  */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment.  */
  gcc_assert (CONST_INT_P (align_rtx));
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to clear?  */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  /* Use the builtin memset after a point, to avoid huge code bloat.
     When optimize_size, avoid any significant code bloat; calling
     memset is about 4 instructions, so allow for one instruction to
     load zero and three to do clearing.  */
  if (TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
    clear_step = 16;
  else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
    clear_step = 8;
  else
    clear_step = 4;

  if (optimize_size && bytes > 3 * clear_step)
    return 0;
  if (! optimize_size && bytes > 8 * clear_step)
    return 0;

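  /* To illustrate the limits above, assuming clear_step is 16: -Os
     permits at most 48 bytes (3 * 16) to be cleared inline, and normal
     optimization at most 128 bytes (8 * 16); larger blocks fall back
     to a memset call.  */
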
  bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);

  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
    {
      machine_mode mode = BLKmode;
      rtx dest;

      if (TARGET_ALTIVEC
          && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok)))
        {
          clear_bytes = 16;
          mode = V4SImode;
        }
      else if (bytes >= 8 && TARGET_POWERPC64
               && (align >= 64 || !STRICT_ALIGNMENT))
        {
          clear_bytes = 8;
          mode = DImode;
          if (offset == 0 && align < 64)
            {
              rtx addr;

              /* If the address form is reg+offset with offset not a
                 multiple of four, reload into reg indirect form here
                 rather than waiting for reload.  This way we get one
                 reload, not one per store.  */
              addr = XEXP (orig_dest, 0);
              if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
                  && CONST_INT_P (XEXP (addr, 1))
                  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
                {
                  addr = copy_addr_to_reg (addr);
                  orig_dest = replace_equiv_address (orig_dest, addr);
                }
            }
        }
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
        {			/* move 4 bytes */
          clear_bytes = 4;
          mode = SImode;
        }
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
        {			/* move 2 bytes */
          clear_bytes = 2;
          mode = HImode;
        }
      else /* move 1 byte at a time */
        {
          clear_bytes = 1;
          mode = QImode;
        }

      dest = adjust_address (orig_dest, mode, offset);

      emit_move_insn (dest, CONST0_RTX (mode));
    }

  return 1;
}

/* Figure out the correct instructions to generate to load data for
   block compare.  MODE is used for the read from memory, and
   data is zero extended if REG is wider than MODE.  If LE code
   is being generated, bswap loads are used.

   REG is the destination register to move the data into.
   MEM is the memory block being read.
   MODE is the mode of memory to use for the read.  */
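/* For example, on LE the 4 bytes { 0x01, 0x02, 0x03, 0x04 } are loaded
   as 0x01020304 after the bswap, so an unsigned compare of two such
   registers orders the blocks byte-lexicographically, as memcmp
   requires.  */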
static void
do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
{
  switch (GET_MODE (reg))
    {
    case E_V16QImode:
      switch (mode)
        {
        case E_V16QImode:
          if (!BYTES_BIG_ENDIAN)
            {
              if (TARGET_P9_VECTOR)
                emit_insn (gen_vsx_ld_elemrev_v16qi_internal (reg, mem));
              else
                {
                  rtx reg_v2di = simplify_gen_subreg (V2DImode, reg,
                                                      V16QImode, 0);
                  gcc_assert (MEM_P (mem));
                  rtx addr = XEXP (mem, 0);
                  rtx mem_v2di = gen_rtx_MEM (V2DImode, addr);
                  MEM_COPY_ATTRIBUTES (mem_v2di, mem);
                  set_mem_size (mem, GET_MODE_SIZE (V2DImode));
                  emit_insn (gen_vsx_ld_elemrev_v2di (reg_v2di, mem_v2di));
                }
            }
          else
            emit_insn (gen_vsx_movv2di_64bit (reg, mem));
          break;
        default:
          gcc_unreachable ();
        }
      break;
    case E_DImode:
      switch (mode)
        {
        case E_QImode:
          emit_insn (gen_zero_extendqidi2 (reg, mem));
          break;
        case E_HImode:
          {
            rtx src = mem;
            if (!BYTES_BIG_ENDIAN)
              {
                src = gen_reg_rtx (HImode);
                emit_insn (gen_bswaphi2 (src, mem));
              }
            emit_insn (gen_zero_extendhidi2 (reg, src));
            break;
          }
        case E_SImode:
          {
            rtx src = mem;
            if (!BYTES_BIG_ENDIAN)
              {
                src = gen_reg_rtx (SImode);
                emit_insn (gen_bswapsi2 (src, mem));
              }
            emit_insn (gen_zero_extendsidi2 (reg, src));
          }
          break;
        case E_DImode:
          if (!BYTES_BIG_ENDIAN)
            emit_insn (gen_bswapdi2 (reg, mem));
          else
            emit_insn (gen_movdi (reg, mem));
          break;
        default:
          gcc_unreachable ();
        }
      break;

    case E_SImode:
      switch (mode)
        {
        case E_QImode:
          emit_insn (gen_zero_extendqisi2 (reg, mem));
          break;
        case E_HImode:
          {
            rtx src = mem;
            if (!BYTES_BIG_ENDIAN)
              {
                src = gen_reg_rtx (HImode);
                emit_insn (gen_bswaphi2 (src, mem));
              }
            emit_insn (gen_zero_extendhisi2 (reg, src));
            break;
          }
        case E_SImode:
          if (!BYTES_BIG_ENDIAN)
            emit_insn (gen_bswapsi2 (reg, mem));
          else
            emit_insn (gen_movsi (reg, mem));
          break;
        case E_DImode:
          /* DImode is larger than the destination reg so is not expected.  */
          gcc_unreachable ();
          break;
        default:
          gcc_unreachable ();
        }
      break;

    case E_QImode:
      gcc_assert (mode == E_QImode);
      emit_move_insn (reg, mem);
      break;

    default:
      gcc_unreachable ();
      break;
    }
}

/* Select the mode to be used for reading the next chunk of bytes
   in the compare.

   OFFSET is the current read offset from the beginning of the block.
   BYTES is the number of bytes remaining to be read.
   ALIGN is the minimum alignment of the memory blocks being compared in bytes.  */
static machine_mode
select_block_compare_mode (unsigned HOST_WIDE_INT offset,
                           unsigned HOST_WIDE_INT bytes,
                           unsigned HOST_WIDE_INT align)
{
  /* First see if we can do a whole load unit
     as that will be more efficient than a larger load + shift.  */

  /* If big, use biggest chunk.
     If exactly chunk size, use that size.
     If remainder can be done in one piece with shifting, do that.
     Do largest chunk possible without violating alignment rules.  */

  /* The most we can read without potential page crossing.  */
  unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);
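  /* For example, bytes = 3 with align = 4 gives maxread = 4: a single
     4-byte load of that chunk cannot cross a page boundary, even
     though it reads one byte past the requested length.  */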

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
                       && word_mode == DImode);

  if (word_mode_ok && bytes >= UNITS_PER_WORD)
    return word_mode;
  else if (bytes == GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes == GET_MODE_SIZE (HImode))
    return HImode;
  else if (bytes == GET_MODE_SIZE (QImode))
    return QImode;
  else if (bytes < GET_MODE_SIZE (SImode)
           && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
           && offset >= GET_MODE_SIZE (SImode) - bytes)
    /* This matches the case where we have SImode and 3 bytes
       and offset >= 1 and permits us to move back one and overlap
       with the previous read, thus avoiding having to shift
       unwanted bytes off of the input.  */
    return SImode;
  else if (word_mode_ok && bytes < UNITS_PER_WORD
           && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
           && offset >= UNITS_PER_WORD - bytes)
    /* Similarly, if we can use DImode it will get matched here and
       can do an overlapping read that ends at the end of the block.  */
    return word_mode;
  else if (word_mode_ok && maxread >= UNITS_PER_WORD)
    /* It is safe to do all remaining in one load of largest size,
       possibly with a shift to get rid of unwanted bytes.  */
    return word_mode;
  else if (maxread >= GET_MODE_SIZE (SImode))
    /* It is safe to do all remaining in one SImode load,
       possibly with a shift to get rid of unwanted bytes.  */
    return SImode;
  else if (bytes > GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes > GET_MODE_SIZE (HImode))
    return HImode;

  /* The final fallback is to do one byte.  */
  return QImode;
}

/* Compute the alignment of pointer+OFFSET where the original alignment
   of pointer was BASE_ALIGN.  */
static unsigned HOST_WIDE_INT
compute_current_alignment (unsigned HOST_WIDE_INT base_align,
                           unsigned HOST_WIDE_INT offset)
{
  if (offset == 0)
    return base_align;
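  /* offset & -offset isolates the lowest set bit of offset; e.g. an
     offset of 12 (binary 1100) yields 4, so pointer+12 can be assumed
     to be at best 4-byte aligned.  */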
  return MIN (base_align, offset & -offset);
}

/* Prepare address and then do a load.

   MODE is the mode to use for the load.
   DEST is the destination register for the data.
   ADDR is the address to be loaded.
   ORIG_ADDR is the original address expression.  */
static void
do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr,
                               rtx orig_addr)
{
  rtx mem = gen_rtx_MEM (mode, addr);
  MEM_COPY_ATTRIBUTES (mem, orig_addr);
  set_mem_size (mem, GET_MODE_SIZE (mode));
  do_load_for_compare (dest, mem, mode);
  return;
}

/* Do a branch for an if/else decision.

   CMPMODE is the mode to use for the comparison.
   COMPARISON is the rtx code for the compare needed.
   A is the first thing to be compared.
   B is the second thing to be compared.
   CR is the condition code reg input, or NULL_RTX.
   TRUE_LABEL is the label to branch to if the condition is true.

   If CR is null_rtx, then a new register of CMPMODE is generated
   and used for the comparison.
   If A and B are both null_rtx, then CR must not be null, and the
   compare is not generated so you can use this with a dot form insn.  */

static void
do_ifelse (machine_mode cmpmode, rtx_code comparison,
           rtx a, rtx b, rtx cr, rtx true_label)
{
  gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
              || (a != NULL_RTX && b != NULL_RTX));

  if (cr != NULL_RTX)
    gcc_assert (GET_MODE (cr) == cmpmode);
  else
    cr = gen_reg_rtx (cmpmode);

  rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label);

  if (a != NULL_RTX)
    emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b));

  rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);

  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
  JUMP_LABEL (j) = true_label;
  LABEL_NUSES (true_label) += 1;
}

/* Emit an isel of the proper mode for DEST.

   DEST is the isel destination register.
   CMP is the comparison rtx for the isel condition.
   SRC_T is the isel source if CR is true.
   SRC_F is the isel source if CR is false.
   CR is the condition register for the isel.  */
static void
do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_isel_signed_di (dest, cmp, src_t, src_f, cr));
  else
    emit_insn (gen_isel_signed_si (dest, cmp, src_t, src_f, cr));
}

/* Emit a subtract of the proper mode for DEST.

   DEST is the destination register for the subtract.
   SRC1 is the first subtract input.
   SRC2 is the second subtract input.

   Computes DEST = SRC1-SRC2.  */
static void
do_sub3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_subdi3 (dest, src1, src2));
  else
    emit_insn (gen_subsi3 (dest, src1, src2));
}

/* Emit an add of the proper mode for DEST.

   DEST is the destination register for the add.
   SRC1 is the first add input.
   SRC2 is the second add input.

   Computes DEST = SRC1+SRC2.  */
static void
do_add3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_adddi3 (dest, src1, src2));
  else
    emit_insn (gen_addsi3 (dest, src1, src2));
}

/* Emit an and of the proper mode for DEST.

   DEST is the destination register for the and.
   SRC1 is the first and input.
   SRC2 is the second and input.

   Computes DEST = SRC1&SRC2.  */
static void
do_and3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_anddi3 (dest, src1, src2));
  else
    emit_insn (gen_andsi3 (dest, src1, src2));
}

/* Emit a cmpb of the proper mode for DEST.

   DEST is the destination register for the cmpb.
   SRC1 is the first input.
   SRC2 is the second input.

   Computes cmpb of SRC1, SRC2.  */
static void
do_cmpb3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_cmpbdi3 (dest, src1, src2));
  else
    emit_insn (gen_cmpbsi3 (dest, src1, src2));
}

/* Emit a rotl of the proper mode for DEST.

   DEST is the destination register for the rotate.
   SRC1 is the rotate input.
   SRC2 is the rotate count.

   Computes DEST = SRC1 rotated left by SRC2.  */
static void
do_rotl3 (rtx dest, rtx src1, rtx src2)
{
  if (GET_MODE (dest) == DImode)
    emit_insn (gen_rotldi3 (dest, src1, src2));
  else
    emit_insn (gen_rotlsi3 (dest, src1, src2));
}

/* Generate rtl for a load, shift, and compare of less than a full word.

   LOAD_MODE is the machine mode for the loads.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem, rtx dcond,
                      rtx src1_addr, rtx src2_addr, rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  rtx shift_amount = gen_reg_rtx (word_mode);
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2);
  do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem);
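  /* A sketch of the intent here: the loads above are byte-swapped on LE
     (and natural on BE), so the first byte of each block ends up in the
     most significant byte of d1/d2.  With an 8-byte load_mode and
     cmp_rem = 3, shift_amount is 5 bytes, scaled to 40 bits below, and
     the logical right shifts discard the bytes past the compare
     length.  */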

  if (word_mode == DImode)
    {
      emit_insn (gen_ashldi3 (shift_amount, shift_amount,
                              GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrdi3 (d1, d1,
                              gen_lowpart (SImode, shift_amount)));
      emit_insn (gen_lshrdi3 (d2, d2,
                              gen_lowpart (SImode, shift_amount)));
    }
  else
    {
      emit_insn (gen_ashlsi3 (shift_amount, shift_amount,
                              GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrsi3 (d1, d1, shift_amount));
      emit_insn (gen_lshrsi3 (d2, d2, shift_amount));
    }

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      if (word_mode == DImode)
        emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
        emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}

/* Generate rtl for an overlapping load and compare of less than a
   full load_mode.  This assumes that the previous word is part of the
   block being compared so it's ok to back up part of a word so we can
   compare the last unaligned full word that ends at the end of the block.

   LOAD_MODE is the machine mode for the loads.
   ISCONST tells whether the remaining length is a constant or in a register.
   BYTES_REM is the remaining length if ISCONST is true.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare if !ISCONST.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_overlap_load_compare (machine_mode load_mode, bool isConst,
                         HOST_WIDE_INT bytes_rem, rtx diff,
                         rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr,
                         rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem;
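  /* For example, an 8-byte load_mode with bytes_rem = 3 gives
     addr_adj = 5: the addresses are backed up 5 bytes so the load ends
     exactly at the end of the block.  */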
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  rtx addr1, addr2;
  if (!isConst || addr_adj)
    {
      rtx adj_reg = gen_reg_rtx (word_mode);
      if (isConst)
        emit_move_insn (adj_reg, GEN_INT (-addr_adj));
      else
        {
          rtx reg_lms = gen_reg_rtx (word_mode);
          emit_move_insn (reg_lms, GEN_INT (load_mode_size));
          do_sub3 (adj_reg, cmp_rem, reg_lms);
        }

      addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg);
      addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg);
    }
  else
    {
      addr1 = src1_addr;
      addr2 = src2_addr;
    }

  do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      if (word_mode == DImode)
        emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
        emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}

/* Generate the sequence of compares for strcmp/strncmp using vec/vsx
   instructions.

   BYTES_TO_COMPARE is the number of bytes to be compared.
   ORIG_SRC1 is the unmodified rtx for the first string.
   ORIG_SRC2 is the unmodified rtx for the second string.
   S1ADDR is the register to use for the base address of the first string.
   S2ADDR is the register to use for the base address of the second string.
   OFF_REG is the register to use for the string offset for loads.
   S1DATA is the register for loading the first string.
   S2DATA is the register for loading the second string.
   VEC_RESULT is the rtx for the vector result indicating the byte difference.
   EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
   to strcmp/strncmp if we have equality at the end of the inline comparison.
   P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
   to clean up and generate the final comparison result.
   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
   set the final result.
   CHECKZERO indicates whether the sequence should check for zero bytes
   for use doing strncmp, or not (for use doing memcmp).  */
static void
expand_cmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
                         rtx orig_src1, rtx orig_src2,
                         rtx s1addr, rtx s2addr, rtx off_reg,
                         rtx s1data, rtx s2data, rtx vec_result,
                         bool equality_compare_rest, rtx *p_cleanup_label,
                         rtx final_move_label, bool checkzero)
{
  machine_mode load_mode;
  unsigned int load_mode_size;
  unsigned HOST_WIDE_INT cmp_bytes = 0;
  unsigned HOST_WIDE_INT offset = 0;
  rtx zero_reg = NULL;

  gcc_assert (p_cleanup_label != NULL);
  rtx cleanup_label = *p_cleanup_label;

  emit_move_insn (s1addr, force_reg (Pmode, XEXP (orig_src1, 0)));
  emit_move_insn (s2addr, force_reg (Pmode, XEXP (orig_src2, 0)));

  if (checkzero && !TARGET_P9_VECTOR)
    {
      zero_reg = gen_reg_rtx (V16QImode);
      emit_move_insn (zero_reg, CONST0_RTX (V16QImode));
    }

  while (bytes_to_compare > 0)
    {
      /* VEC/VSX compare sequence for P8:
	 check each 16B with:
	 lxvd2x 32,28,8
	 lxvd2x 33,29,8
	 vcmpequb 2,0,1  # compare strings
	 vcmpequb 4,0,3  # compare w/ 0
	 xxlorc 37,36,34  # first FF byte is either mismatch or end of string
	 vcmpequb. 7,5,3  # reg 7 contains 0
	 bnl 6,.Lmismatch

	 For the P8 LE case, we use lxvd2x and compare full 16 bytes
	 but then use vgbbd and a shift to get two bytes with the
	 information we need in the correct order.

	 VEC/VSX compare sequence if TARGET_P9_VECTOR:
	 lxvb16x/lxvb16x  # load 16B of each string
	 vcmpnezb.  # produces difference location or zero byte location
	 bne 6,.Lmismatch

	 Use the overlapping compare trick for the last block if it is
	 less than 16 bytes.  */

      load_mode = V16QImode;
      load_mode_size = GET_MODE_SIZE (load_mode);

      if (bytes_to_compare >= load_mode_size)
        cmp_bytes = load_mode_size;
      else
        {
          /* Move this load back so it doesn't go past the end.  P8/P9
             can do this efficiently.  This is never called with less
             than 16 bytes so we should always be able to do this.  */
          unsigned int extra_bytes = load_mode_size - bytes_to_compare;
          cmp_bytes = bytes_to_compare;
          gcc_assert (offset > extra_bytes);
          offset -= extra_bytes;
          cmp_bytes = load_mode_size;
          bytes_to_compare = cmp_bytes;
        }

      /* The offset currently used is always kept in off_reg so that the
         cleanup code on P8 can use it to extract the differing byte.  */
      emit_move_insn (off_reg, GEN_INT (offset));

      rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg);
      do_load_for_compare_from_addr (load_mode, s1data, addr1, orig_src1);
      rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg);
      do_load_for_compare_from_addr (load_mode, s2data, addr2, orig_src2);

      /* Cases to handle.  A and B are chunks of the two strings.
	 1: Not end of comparison:
	    A != B: branch to cleanup code to compute result.
	    A == B: next block
	 2: End of the inline comparison:
	    A != B: branch to cleanup code to compute result.
	    A == B: call strcmp/strncmp
	 3: compared requested N bytes:
	    A == B: branch to result 0.
	    A != B: cleanup code to compute result.  */

      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;

      if (checkzero)
        {
          if (TARGET_P9_VECTOR)
            emit_insn (gen_vcmpnezb_p (vec_result, s1data, s2data));
          else
            {
              /* Emit instructions to do comparison and zero check.  */
              rtx cmp_res = gen_reg_rtx (load_mode);
              rtx cmp_zero = gen_reg_rtx (load_mode);
              rtx cmp_combined = gen_reg_rtx (load_mode);
              emit_insn (gen_altivec_eqv16qi (cmp_res, s1data, s2data));
              emit_insn (gen_altivec_eqv16qi (cmp_zero, s1data, zero_reg));
              emit_insn (gen_orcv16qi3 (vec_result, cmp_zero, cmp_res));
              emit_insn (gen_altivec_vcmpequb_p (cmp_combined, vec_result, zero_reg));
            }
        }
      else
        emit_insn (gen_altivec_vcmpequb_p (vec_result, s1data, s2data));

      bool branch_to_cleanup = (remain > 0 || equality_compare_rest);
      rtx cr6 = gen_rtx_REG (CCmode, CR6_REGNO);
      rtx dst_label;
      rtx cmp_rtx;
      if (branch_to_cleanup)
        {
          /* Branch to cleanup code, otherwise fall through to do more
             compares.  P8 and P9 use different CR bits because on P8
             we are looking at the result of a comparison vs a
             register of zeroes so the all-true condition means no
             difference or zero was found.  On P9, vcmpnezb sets a byte
             to 0xff if there is a mismatch or zero, so the all-false
             condition indicates we found no difference or zero.  */
          if (!cleanup_label)
            cleanup_label = gen_label_rtx ();
          dst_label = cleanup_label;
          if (TARGET_P9_VECTOR && checkzero)
            cmp_rtx = gen_rtx_NE (VOIDmode, cr6, const0_rtx);
          else
            cmp_rtx = gen_rtx_GE (VOIDmode, cr6, const0_rtx);
        }
      else
        {
          /* Branch to final return or fall through to cleanup,
             result is already set to 0.  */
          dst_label = final_move_label;
          if (TARGET_P9_VECTOR && checkzero)
            cmp_rtx = gen_rtx_EQ (VOIDmode, cr6, const0_rtx);
          else
            cmp_rtx = gen_rtx_LT (VOIDmode, cr6, const0_rtx);
        }

      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
                                         lab_ref, pc_rtx);
      rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
      JUMP_LABEL (j2) = dst_label;
      LABEL_NUSES (dst_label) += 1;

      offset += cmp_bytes;
      bytes_to_compare -= cmp_bytes;
    }
  *p_cleanup_label = cleanup_label;
  return;
}

/* Generate the final sequence that identifies the differing
   byte and generates the final result, taking into account
   zero bytes:

   P8:
   vgbbd 0,0
   vsldoi 0,0,0,9
   mfvsrd 9,32
   addi 10,9,-1    # count trailing zero bits
   andc 9,10,9
   popcntd 9,9
   lbzx 10,28,9    # use that offset to load differing byte
   lbzx 3,29,9
   subf 3,3,10     # subtract for final result

   P9:
   vclzlsbb        # counts trailing bytes with lsb=0
   vextublx        # extract differing byte

   STR1 is the reg rtx for data from string 1.
   STR2 is the reg rtx for data from string 2.
   RESULT is the reg rtx for the comparison result.
   S1ADDR is the register to use for the base address of the first string.
   S2ADDR is the register to use for the base address of the second string.
   ORIG_SRC1 is the unmodified rtx for the first string.
   ORIG_SRC2 is the unmodified rtx for the second string.
   OFF_REG is the register to use for the string offset for loads.
   VEC_RESULT is the rtx for the vector result indicating the byte difference.  */

static void
emit_final_compare_vec (rtx str1, rtx str2, rtx result,
                        rtx s1addr, rtx s2addr,
                        rtx orig_src1, rtx orig_src2,
                        rtx off_reg, rtx vec_result)
{

  if (TARGET_P9_VECTOR)
    {
      rtx diffix = gen_reg_rtx (SImode);
      rtx chr1 = gen_reg_rtx (SImode);
      rtx chr2 = gen_reg_rtx (SImode);
      rtx chr1_di = simplify_gen_subreg (DImode, chr1, SImode, 0);
      rtx chr2_di = simplify_gen_subreg (DImode, chr2, SImode, 0);
      emit_insn (gen_vclzlsbb_v16qi (diffix, vec_result));
      emit_insn (gen_vextublx (chr1, diffix, str1));
      emit_insn (gen_vextublx (chr2, diffix, str2));
      do_sub3 (result, chr1_di, chr2_di);
    }
  else
    {
      gcc_assert (TARGET_P8_VECTOR);
      rtx diffix = gen_reg_rtx (DImode);
      rtx result_gbbd = gen_reg_rtx (V16QImode);
      /* Since each byte of the input is either 00 or FF, the bytes in
         dw0 and dw1 after vgbbd are all identical to each other.  */
      emit_insn (gen_p8v_vgbbd (result_gbbd, vec_result));
      /* For LE, we shift by 9 and get BA in the low two bytes then CTZ.
         For BE, we shift by 7 and get AB in the high two bytes then CLZ.  */
      rtx result_shifted = gen_reg_rtx (V16QImode);
      int shift_amt = (BYTES_BIG_ENDIAN) ? 7 : 9;
      emit_insn (gen_altivec_vsldoi_v16qi (result_shifted, result_gbbd,
                                           result_gbbd, GEN_INT (shift_amt)));

      rtx diffix_df = simplify_gen_subreg (DFmode, diffix, DImode, 0);
      emit_insn (gen_p8_mfvsrd_3_v16qi (diffix_df, result_shifted));
      rtx count = gen_reg_rtx (DImode);

      if (BYTES_BIG_ENDIAN)
        emit_insn (gen_clzdi2 (count, diffix));
      else
        emit_insn (gen_ctzdi2 (count, diffix));

      /* P8 doesn't have a good solution for extracting one byte from
         a vsx reg like vextublx on P9 so we just compute the offset
         of the differing byte and load it from each string.  */
      do_add3 (off_reg, off_reg, count);

      rtx chr1 = gen_reg_rtx (QImode);
      rtx chr2 = gen_reg_rtx (QImode);
      rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg);
      do_load_for_compare_from_addr (QImode, chr1, addr1, orig_src1);
      rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg);
      do_load_for_compare_from_addr (QImode, chr2, addr2, orig_src2);
      machine_mode rmode = GET_MODE (result);
      rtx chr1_rm = simplify_gen_subreg (rmode, chr1, QImode, 0);
      rtx chr2_rm = simplify_gen_subreg (rmode, chr2, QImode, 0);
      do_sub3 (result, chr1_rm, chr2_rm);
    }

  return;
}

/* Expand a block compare operation using loop code, and return true
   if successful.  Return false if we should let the compiler generate
   normal code, probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_compare_loop (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  /* Allow non-const length.  */
  int bytes_is_const = CONST_INT_P (bytes_rtx);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
  HOST_WIDE_INT minalign = MIN (align1, align2);

  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to compare?  */
  HOST_WIDE_INT bytes = 0;
  if (bytes_is_const)
    bytes = INTVAL (bytes_rtx);

  if (bytes_is_const && bytes == 0)
    return true;

  /* Limit the amount we compare, if known statically.  */
  HOST_WIDE_INT max_bytes;
  switch (rs6000_tune)
    {
    case PROCESSOR_POWER7:
      if (!bytes_is_const)
	if (minalign < 8)
	  max_bytes = 0;
	else
	  max_bytes = 128;
      else
	if (minalign < 8)
	  max_bytes = 32;
	else
	  max_bytes = 128;
      break;
    case PROCESSOR_POWER8:
      if (!bytes_is_const)
	max_bytes = 0;
      else
	if (minalign < 8)
	  max_bytes = 128;
	else
	  max_bytes = 64;
      break;
    case PROCESSOR_POWER9:
      if (bytes_is_const)
	max_bytes = 191;
      else
	max_bytes = 0;
      break;
    default:
      max_bytes = 128;
    }

  /* Allow the option to override the default.  */
  if (rs6000_block_compare_inline_loop_limit >= 0)
    max_bytes = (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_loop_limit;

  if (max_bytes == 0)
    return false;

  rtx cmp_rem = gen_reg_rtx (word_mode);  /* Remainder for library call.  */
  rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop.  */
  HOST_WIDE_INT niter;
  rtx iter = gen_reg_rtx (word_mode);
  rtx iv1 = gen_reg_rtx (word_mode);
  rtx iv2 = gen_reg_rtx (word_mode);
  rtx d1_1 = gen_reg_rtx (word_mode);  /* Data read from src1+iv1.  */
  rtx d1_2 = gen_reg_rtx (word_mode);  /* Data read from src1+iv2.  */
  rtx d2_1 = gen_reg_rtx (word_mode);  /* Data read from src2+iv1.  */
  rtx d2_2 = gen_reg_rtx (word_mode);  /* Data read from src2+iv2.  */

  /* Strip unneeded subreg from length if there is one.  */
  if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx))
    bytes_rtx = SUBREG_REG (bytes_rtx);
  /* Extend bytes_rtx to word_mode if needed.  But, we expect only to
     maybe have to deal with the case where bytes_rtx is SImode and
     word_mode is DImode.  */
  if (!bytes_is_const)
    {
      if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
	/* Do not expect length longer than word_mode.  */
	return false;
      else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode))
	{
	  bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
	  bytes_rtx = force_reg (word_mode,
				 gen_rtx_fmt_e (ZERO_EXTEND, word_mode,
						bytes_rtx));
	}
      else
	/* Make sure it's in a register before we get started.  */
	bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
    }

  machine_mode load_mode = word_mode;
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);

  /* Number of bytes per iteration of the unrolled loop.  */
  HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
  /* Max iters and bytes compared in the loop.  */
  HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes;
  HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes;
  int l2lb = floor_log2 (loop_bytes);
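  /* For instance, on a 64-bit target loop_bytes is 16 and l2lb is 4,
     so below the loop compares bytes & ~15 bytes in bytes >> 4
     iterations, leaving bytes & 15 for the cleanup code.  */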

  if (bytes_is_const && (max_bytes < load_mode_size
			 || !IN_RANGE (bytes, load_mode_size, max_bytes)))
    return false;

  bool no_remainder_code = false;
  rtx final_label = gen_label_rtx ();
  rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
  rtx diff_label = gen_label_rtx ();
  rtx library_call_label = NULL;
  rtx cleanup_label = gen_label_rtx ();

  rtx cr;

  rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
  rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));

  /* Difference found is stored here before jump to diff_label.  */
  rtx diff = gen_reg_rtx (word_mode);
  rtx j;

  /* Example of generated code for 35 bytes aligned 1 byte.

	     mtctr 8
	     li 6,0
	     li 5,8
     .L13:
	     ldbrx 7,3,6
	     ldbrx 9,10,6
	     ldbrx 0,3,5
	     ldbrx 4,10,5
	     addi 6,6,16
	     addi 5,5,16
	     subfc. 9,9,7
	     bne 0,.L10
	     subfc. 9,4,0
	     bdnzt 2,.L13
	     bne 0,.L10
	     add 3,3,6
	     add 10,10,6
	     addi 9,3,-5
	     ldbrx 7,0,9
	     addi 9,10,-5
	     ldbrx 9,0,9
	     subfc 9,9,7
	     .p2align 4,,15
     .L10:
	     popcntd 9,9
	     subfe 10,10,10
	     or 9,9,10

     Compiled with -fno-reorder-blocks for clarity.  */

  /* Structure of what we're going to do:
     Two separate lengths: what we will compare before bailing to library
	call (max_bytes), and the total length to be checked.
     if length <= 16, branch to linear cleanup code starting with
	remainder length check (length not known at compile time)
     set up 2 iv's and load count reg, compute remainder length
     unrollx2 compare loop
     if loop exit due to a difference, branch to difference handling code
     if remainder length < 8, branch to final cleanup compare
     load and compare 8B
     final cleanup comparison (depends on alignment and length)
	load 8B, shift off bytes past length, compare
	load 8B ending at last byte and compare
	load/compare 1 byte at a time (short block abutting 4k boundary)
     difference handling, 64->32 conversion
     final result
     branch around memcmp call
     memcmp library call  */

  /* If bytes is not const, compare length and branch directly
     to the cleanup code that can handle 0-16 bytes if length
     is >= 16.  Stash away bytes-max_bytes for the library call.  */
  if (bytes_is_const)
    {
      /* These need to be set for some of the places we may jump to.  */
      if (bytes > max_bytes)
	{
	  no_remainder_code = true;
	  niter = max_loop_iter;
	  library_call_label = gen_label_rtx ();
	}
      else
	{
	  niter = bytes / loop_bytes;
	}
      emit_move_insn (iter, GEN_INT (niter));
      emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes));
      emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes));
    }
  else
    {
      library_call_label = gen_label_rtx ();

      /* If we go to the cleanup code, it expects length to be in cmp_rem.  */
      emit_move_insn (cmp_rem, bytes_rtx);

      /* Check for > max_bytes bytes.  We want to bail out as quickly as
	 possible if we have to go over to memcmp.  */
      do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
		 NULL_RTX, library_call_label);

      /* Check for < loop_bytes bytes.  */
      do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes),
		 NULL_RTX, cleanup_label);

      /* Loop compare bytes and iterations if bytes>max_bytes.  */
      rtx mb_reg = gen_reg_rtx (word_mode);
      emit_move_insn (mb_reg, GEN_INT (max_loop_bytes));
      rtx mi_reg = gen_reg_rtx (word_mode);
      emit_move_insn (mi_reg, GEN_INT (max_loop_iter));

      /* Compute number of loop iterations if bytes <= max_bytes.  */
      if (word_mode == DImode)
	emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb)));
      else
	emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb)));

      /* Compute bytes to compare in loop if bytes <= max_bytes.  */
      rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb);
      if (word_mode == DImode)
	{
	  emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask));
	}
      else
	{
	  emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask));
	}

      /* Check for bytes <= max_bytes.  */
      if (TARGET_ISEL)
	{
	  /* P9 has fast isel so we use one compare and two isel.  */
	  cr = gen_reg_rtx (CCmode);
	  rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx,
					     GEN_INT (max_bytes));
	  emit_move_insn (cr, compare_rtx);
	  rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx);
	  do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr);
	  do_isel (iter, cmp_rtx, iter, mi_reg, cr);
	}
      else
	{
	  rtx lab_after = gen_label_rtx ();
	  do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes),
		     NULL_RTX, lab_after);
	  emit_move_insn (loop_cmp, mb_reg);
	  emit_move_insn (iter, mi_reg);
	  emit_label (lab_after);
	}

      /* Now compute remainder bytes which isn't used until after the loop.  */
      do_sub3 (cmp_rem, bytes_rtx, loop_cmp);
    }

  rtx dcond = NULL_RTX; /* Used for when we jump to diff_label.  */
  /* For p9 we need to have just one of these as multiple places define
     it and it gets used by the setb at the end.  */
  if (TARGET_P9_MISC)
    dcond = gen_reg_rtx (CCUNSmode);

  if (!bytes_is_const || bytes >= loop_bytes)
    {
      /* It should not be possible to come here if remaining bytes are
	 < 16 in the runtime case either.  Compute number of loop
	 iterations.  We compare 2*word_mode per iteration so 16B for
	 64-bit code and 8B for 32-bit.  Set up two induction
	 variables and load count register.  */

      /* HACK ALERT: create hard reg for CTR here.  If we just use a
	 pseudo, cse will get rid of it and then the allocator will
	 see it used in the lshr above and won't give us ctr.  */
      rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
      emit_move_insn (ctr, iter);
      emit_move_insn (diff, GEN_INT (0));
      emit_move_insn (iv1, GEN_INT (0));
      emit_move_insn (iv2, GEN_INT (load_mode_size));

      /* inner loop to compare 2*word_mode */
      rtx loop_top_label = gen_label_rtx ();
      emit_label (loop_top_label);

      rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1);
      rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1);

      do_load_for_compare_from_addr (load_mode, d1_1,
				     src1_ix1, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_1,
				     src2_ix1, orig_src2);
      do_add3 (iv1, iv1, GEN_INT (loop_bytes));

      rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2);
      rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2);

      do_load_for_compare_from_addr (load_mode, d1_2,
				     src1_ix2, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_2,
				     src2_ix2, orig_src2);
      do_add3 (iv2, iv2, GEN_INT (loop_bytes));

      if (TARGET_P9_MISC)
	{
	  /* Generate a compare, and convert with a setb later.  */
	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
	  emit_insn (gen_rtx_SET (dcond, cmp));
	}
      else
	{
	  dcond = gen_reg_rtx (CCmode);
	  if (word_mode == DImode)
	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	  else
	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	}

      do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
		 dcond, diff_label);

      if (TARGET_P9_MISC)
	{
	  /* Generate a compare, and convert with a setb later.  */
	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2);
	  emit_insn (gen_rtx_SET (dcond, cmp));
	}
      else
	{
	  dcond = gen_reg_rtx (CCmode);
	  if (word_mode == DImode)
	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond));
	  else
	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond));
	}

      rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2);
      if (TARGET_64BIT)
	j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr,
					   eqrtx, dcond));
      else
	j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
					   eqrtx, dcond));
      JUMP_LABEL (j) = loop_top_label;
      LABEL_NUSES (loop_top_label) += 1;
    }

  HOST_WIDE_INT bytes_remaining = 0;
  if (bytes_is_const)
    bytes_remaining = (bytes % loop_bytes);

  /* If diff is nonzero, branch to difference handling
     code.  If we exit here with a nonzero diff, it is
     because the second word differed.  */
  if (TARGET_P9_MISC)
    do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond, diff_label);
  else
    do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX, diff_label);

  if (library_call_label != NULL && bytes_is_const && bytes > max_bytes)
    {
      /* If the length is known at compile time, then we will always
	 have a remainder to go to the library call with.  */
      rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, library_call_label);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref));
      JUMP_LABEL (j) = library_call_label;
      LABEL_NUSES (library_call_label) += 1;
      emit_barrier ();
    }

  if (bytes_is_const && bytes_remaining == 0)
    {
      /* No remainder, and if we are here then diff is 0 so just return 0.  */
      if (TARGET_64BIT)
	emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
      else
	emit_move_insn (target, diff);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
      JUMP_LABEL (j) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }
  else if (!no_remainder_code)
    {
      /* Update addresses to point to the next word to examine.  */
      do_add3 (src1_addr, src1_addr, iv1);
      do_add3 (src2_addr, src2_addr, iv1);

      emit_label (cleanup_label);

      if (!bytes_is_const)
	{
	  /* If we're dealing with runtime length, we have to check if
	     it's zero after the loop.  When length is known at compile
	     time the no-remainder condition is dealt with above.  By
	     doing this after cleanup_label, we also deal with the
	     case where length is 0 at the start and we bypass the
	     loop with a branch to cleanup_label.  */
	  emit_move_insn (target, const0_rtx);
	  do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
		     NULL_RTX, final_label);
	}

      rtx final_cleanup = gen_label_rtx ();
      rtx cmp_rem_before = gen_reg_rtx (word_mode);
      /* Compare one more word_mode chunk if needed.  */
      if (!bytes_is_const || bytes_remaining >= load_mode_size)
	{
	  /* If remainder length < word length, branch to final
	     cleanup compare.  */
	  if (!bytes_is_const)
	    do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size),
		       NULL_RTX, final_cleanup);

	  /* load and compare 8B */
	  do_load_for_compare_from_addr (load_mode, d1_1,
					 src1_addr, orig_src1);
	  do_load_for_compare_from_addr (load_mode, d2_1,
					 src2_addr, orig_src2);

	  /* Compare the word, see if we need to do the last partial.  */
	  if (TARGET_P9_MISC)
	    {
	      /* Generate a compare, and convert with a setb later.  */
	      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
	      emit_insn (gen_rtx_SET (dcond, cmp));
	    }
	  else
	    {
	      dcond = gen_reg_rtx (CCmode);
	      if (word_mode == DImode)
		emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	      else
		emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	    }

	  do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
		     dcond, diff_label);

	  do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size));
	  do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size));
	  emit_move_insn (cmp_rem_before, cmp_rem);
	  do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size));
	  if (bytes_is_const)
	    bytes_remaining -= load_mode_size;
	  else
	    /* See if remaining length is now zero.  We previously set
	       target to 0 so we can just jump to the end.  */
	    do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
		       NULL_RTX, final_label);

	}

      /* Cases:
	 bytes_is_const
	   We can always shift back to do an overlapping compare
	   of the last chunk because we know length >= 8.

	 !bytes_is_const
	   align>=load_mode_size
	     Read word_mode and mask
	   align<load_mode_size
	     avoid stepping past end

	 Three strategies:
	 * decrement address and do overlapping compare
	 * read word_mode and mask
	 * carefully avoid crossing 4k boundary  */

      if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7))
	  && align1 >= load_mode_size && align2 >= load_mode_size)
	{
	  /* Alignment is larger than word_mode so we do not need to be
	     concerned with extra page crossings.  But, we do not know
	     that the length is larger than load_mode_size so we might
	     end up comparing against data before the block if we try
	     an overlapping compare.  Also we use this on P7 for fixed length
	     remainder because P7 doesn't like overlapping unaligned loads.
	     Strategy: load 8B, shift off bytes past length, and compare.  */
	  emit_label (final_cleanup);
	  do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
				src1_addr, src2_addr, orig_src1, orig_src2);
	}
      else if (bytes_remaining && bytes_is_const)
	{
	  /* We do not do loop expand if length < 32 so we know at the
	     end we can do an overlapping compare.
	     Strategy: shift address back and do word_mode load that
	     ends at the end of the block.  */
	  emit_label (final_cleanup);
	  do_overlap_load_compare (load_mode, true, bytes_remaining, diff,
				   cmp_rem, dcond, src1_addr, src2_addr,
				   orig_src1, orig_src2);
	}
      else if (!bytes_is_const)
	{
	  rtx handle4k_label = gen_label_rtx ();
	  rtx nonconst_overlap = gen_label_rtx ();
	  emit_label (nonconst_overlap);

	  /* Here we have to handle the case where we have a runtime
	     length which may be too short for overlap compare, and
	     alignment is not at least load_mode_size so we have to
	     tread carefully to avoid stepping across 4k boundaries.  */

	  /* If the length after the loop was larger than word_mode
	     size, we can just do an overlapping compare and we're
	     done.  We fall through to this code from the word_mode
	     compare that precedes this.  */
	  do_overlap_load_compare (load_mode, false, 0, diff,
				   cmp_rem, dcond, src1_addr, src2_addr,
				   orig_src1, orig_src2);

	  rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label);
	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
	  JUMP_LABEL (j) = diff_label;
	  LABEL_NUSES (diff_label) += 1;
	  emit_barrier ();

	  /* If we couldn't do the overlap compare we have to be more
	     careful of the 4k boundary.  Test to see if either
	     address is less than word_mode_size away from a 4k
	     boundary.  If not, then we can do a load/shift/compare
	     and we are done.  We come to this code if length was less
	     than word_mode_size.  */

	  emit_label (final_cleanup);

	  /* We can still avoid the slow case if the length was larger
	     than one loop iteration, in which case go do the overlap
	     load compare path.  */
	  do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes),
		     NULL_RTX, nonconst_overlap);

	  rtx rem4k = gen_reg_rtx (word_mode);
	  rtx dist1 = gen_reg_rtx (word_mode);
	  rtx dist2 = gen_reg_rtx (word_mode);
	  do_sub3 (rem4k, GEN_INT (4096), cmp_rem);
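	  /* An informal reading of the checks below: dist1/dist2 are
	     the offsets of the addresses within a 4k page.  If
	     dist <= 4096 - cmp_rem, the remaining bytes end at or
	     before the page boundary, so a full word_mode load could
	     stray onto the next (possibly unmapped) page and we take
	     the byte-at-a-time path; otherwise the block itself
	     crosses the boundary and a full load is safe.  */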
	  if (word_mode == SImode)
	    emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff)));
	  else
	    emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff)));
	  do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX, handle4k_label);
	  if (word_mode == SImode)
	    emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff)));
	  else
	    emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff)));
	  do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX, handle4k_label);

	  /* We don't have a 4k boundary to deal with, so do
	     a load/shift/compare and jump to diff.  */

	  do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
				src1_addr, src2_addr, orig_src1, orig_src2);

	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
	  JUMP_LABEL (j) = diff_label;
	  LABEL_NUSES (diff_label) += 1;
	  emit_barrier ();

	  /* Finally in the unlikely case we are inching up to a
	     4k boundary we use a compact lbzx/compare loop to do
	     it a byte at a time.  */

	  emit_label (handle4k_label);

	  rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
	  emit_move_insn (ctr, cmp_rem);
	  rtx ixreg = gen_reg_rtx (Pmode);
	  emit_move_insn (ixreg, const0_rtx);

	  rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg);
	  rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg);
	  rtx d1 = gen_reg_rtx (word_mode);
	  rtx d2 = gen_reg_rtx (word_mode);

	  rtx fc_loop = gen_label_rtx ();
	  emit_label (fc_loop);

	  do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1);
	  do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2);

	  do_add3 (ixreg, ixreg, const1_rtx);

	  rtx cond = gen_reg_rtx (CCmode);
	  rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2);
	  rs6000_emit_dot_insn (diff, subexpr, 2, cond);

	  rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2);
	  if (TARGET_64BIT)
	    j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr,
					       eqrtx, cond));
	  else
	    j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr,
					       eqrtx, cond));
	  JUMP_LABEL (j) = fc_loop;
	  LABEL_NUSES (fc_loop) += 1;

	  if (TARGET_64BIT)
	    emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
	  else
	    emit_move_insn (target, diff);

	  /* Since we are comparing bytes, the difference can be used
	     as the final result and we are done here.  */
	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
	  JUMP_LABEL (j) = final_label;
	  LABEL_NUSES (final_label) += 1;
	  emit_barrier ();
	}
    }

  emit_label (diff_label);
  /* difference handling, 64->32 conversion */

  /* We need to produce DI result from sub, then convert to target SI
     while maintaining <0 / ==0 / >0 properties.  This sequence works:
     subfc L,A,B
     subfe H,H,H
     popcntd L,L
     rldimi L,H,6,0

     This is an alternate one Segher cooked up if somebody
     wants to expand this for something that doesn't have popcntd:
     subfc L,a,b
     subfe H,x,x
     addic t,L,-1
     subfe v,t,L
     or z,v,H

     And finally, p9 can just do this:
     cmpld A,B
     setb r */

  if (TARGET_P9_MISC)
    emit_insn (gen_setb_unsigned (target, dcond));
  else
    {
      if (TARGET_64BIT)
	{
	  rtx tmp_reg_ca = gen_reg_rtx (DImode);
	  emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
	  emit_insn (gen_popcntddi2 (diff, diff));
	  emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca));
	  emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
	}
      else
	{
	  rtx tmp_reg_ca = gen_reg_rtx (SImode);
	  emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
	  emit_insn (gen_popcntdsi2 (diff, diff));
	  emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca));
	}
    }

  if (library_call_label != NULL)
    {
      /* Branch around memcmp call.  */
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
      JUMP_LABEL (j) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();

      /* Make memcmp library call.  cmp_rem is the number of bytes not
	 yet compared, which is the amount memcmp will be asked to
	 compare.  If we don't find a difference in the loop compare, do
	 the library call directly instead of doing a small compare just
	 to get to an arbitrary boundary before calling it anyway.
	 Also, update addresses to point to the next word to examine.  */
      emit_label (library_call_label);

      rtx len_rtx = gen_reg_rtx (word_mode);
      if (bytes_is_const)
	{
	  emit_move_insn (len_rtx, cmp_rem);
	  do_add3 (src1_addr, src1_addr, iv1);
	  do_add3 (src2_addr, src2_addr, iv1);
	}
      else
	emit_move_insn (len_rtx, bytes_rtx);

      tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP);
      emit_library_call_value (XEXP (DECL_RTL (fun), 0),
			       target, LCT_NORMAL, GET_MODE (target),
			       src1_addr, Pmode,
			       src2_addr, Pmode,
			       len_rtx, GET_MODE (len_rtx));
    }

  /* emit final_label */
  emit_label (final_label);
  return true;
}

/* Generate code to convert a DImode-plus-carry subtract result into
   a SImode result that has the same <0 / ==0 / >0 properties to
   produce the final result from memcmp.

   TARGET is the rtx for the register to receive the memcmp result.
   SUB_RESULT is the rtx for the register containing the subtract result.  */

void
generate_6432_conversion(rtx target, rtx sub_result)
{
  /* We need to produce DI result from sub, then convert to target SI
     while maintaining <0 / ==0 / >0 properties.  This sequence works:
     subfc L,A,B
     subfe H,H,H
     popcntd L,L
     rldimi L,H,6,0

     This is an alternate one Segher cooked up if somebody
     wants to expand this for something that doesn't have popcntd:
     subfc L,a,b
     subfe H,x,x
     addic t,L,-1
     subfe v,t,L
     or z,v,H

     And finally, p9 can just do this:
     cmpld A,B
     setb r */
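
  /* An informal C model of the popcntd sequence above (an illustration,
     not the emitted RTL): let L be the word-mode difference of the two
     loaded chunks and let H be -1 if the subtract borrowed (first chunk
     unsigned-less than the second) and 0 otherwise.  Then
     popcount (L) | H is negative, zero, or positive exactly as memcmp
     requires.  */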

  if (TARGET_64BIT)
    {
      rtx tmp_reg_ca = gen_reg_rtx (DImode);
      emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
      rtx popcnt = gen_reg_rtx (DImode);
      emit_insn (gen_popcntddi2 (popcnt, sub_result));
      rtx tmp2 = gen_reg_rtx (DImode);
      emit_insn (gen_iordi3 (tmp2, popcnt, tmp_reg_ca));
      emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp2)));
    }
  else
    {
      rtx tmp_reg_ca = gen_reg_rtx (SImode);
      emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
      rtx popcnt = gen_reg_rtx (SImode);
      emit_insn (gen_popcntdsi2 (popcnt, sub_result));
      emit_insn (gen_iorsi3 (target, popcnt, tmp_reg_ca));
    }
}
8845cb37 1663
37ae4739
AS
1664/* Generate memcmp expansion using in-line non-loop GPR instructions.
1665 The bool return indicates whether code for a 64->32 conversion
1666 should be generated.
1667
1668 BYTES is the number of bytes to be compared.
1669 BASE_ALIGN is the minimum alignment for both blocks to compare.
1670 ORIG_SRC1 is the original pointer to the first block to compare.
1671 ORIG_SRC2 is the original pointer to the second block to compare.
1672 SUB_RESULT is the reg rtx for the result from the final subtract.
1673	   COND is the rtx for a condition register that will be used for the final
1674 compare on power9 or better.
1675 FINAL_RESULT is the reg rtx for the final memcmp result.
1676 P_CONVERT_LABEL is a pointer to rtx that will be used to store the
1677 label generated for a branch to the 64->32 code, if such a branch
1678 is needed.
1679 P_FINAL_LABEL is a pointer to rtx that will be used to store the label
1680 for the end of the memcmp if a branch there is needed.
1681*/
8845cb37 1682
37ae4739
AS
1683bool
1684expand_block_compare_gpr (unsigned HOST_WIDE_INT bytes, unsigned int base_align,
1685 rtx orig_src1, rtx orig_src2,
1686 rtx sub_result, rtx cond, rtx final_result,
1687 rtx *p_convert_label, rtx *p_final_label)
1688{
8845cb37
AS
1689	  /* Example of generated code for 18 bytes with 1-byte alignment.
1690 Compiled with -fno-reorder-blocks for clarity.
1691 ldbrx 10,31,8
1692 ldbrx 9,7,8
1693 subfc. 9,9,10
1694 bne 0,.L6487
1695 addi 9,12,8
1696 addi 5,11,8
1697 ldbrx 10,0,9
1698 ldbrx 9,0,5
1699 subfc. 9,9,10
1700 bne 0,.L6487
1701 addi 9,12,16
1702 lhbrx 10,0,9
1703 addi 9,11,16
1704 lhbrx 9,0,9
1705 subf 9,9,10
1706 b .L6488
1707 .p2align 4,,15
1708 .L6487: #convert_label
1709 popcntd 9,9
1710 subfe 10,10,10
1711 or 9,9,10
1712 .L6488: #final_label
1713 extsw 10,9
1714
1715 We start off with DImode for two blocks that jump to the DI->SI conversion
1716 if the difference is found there, then a final block of HImode that skips
1717 the DI->SI conversion. */
1718
37ae4739
AS
1719 unsigned HOST_WIDE_INT offset = 0;
1720 unsigned int load_mode_size;
1721 HOST_WIDE_INT cmp_bytes = 0;
1722 rtx src1 = orig_src1;
1723 rtx src2 = orig_src2;
1724 rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
1725 rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
1726 bool need_6432_conv = false;
1727 rtx convert_label = NULL;
1728 rtx final_label = NULL;
1729 machine_mode load_mode;
1730
8845cb37
AS
1731 while (bytes > 0)
1732 {
1733 unsigned int align = compute_current_alignment (base_align, offset);
74f9986e 1734 load_mode = select_block_compare_mode (offset, bytes, align);
8845cb37
AS
1735 load_mode_size = GET_MODE_SIZE (load_mode);
1736 if (bytes >= load_mode_size)
1737 cmp_bytes = load_mode_size;
1738 else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
1739 {
1740 /* Move this load back so it doesn't go past the end.
1741 P8/P9 can do this efficiently. */
1742 unsigned int extra_bytes = load_mode_size - bytes;
1743 cmp_bytes = bytes;
1744 if (extra_bytes < offset)
1745 {
1746 offset -= extra_bytes;
1747 cmp_bytes = load_mode_size;
1748 bytes = cmp_bytes;
1749 }
1750 }
1751 else
1752 /* P7 and earlier can't do the overlapping load trick fast,
1753 so this forces a non-overlapping load and a shift to get
1754 rid of the extra bytes. */
1755 cmp_bytes = bytes;
1756
1757 src1 = adjust_address (orig_src1, load_mode, offset);
1758 src2 = adjust_address (orig_src2, load_mode, offset);
1759
1760 if (!REG_P (XEXP (src1, 0)))
1761 {
1762 rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
1763 src1 = replace_equiv_address (src1, src1_reg);
1764 }
f4f867ca 1765 set_mem_size (src1, load_mode_size);
8845cb37
AS
1766
1767 if (!REG_P (XEXP (src2, 0)))
1768 {
1769 rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
1770 src2 = replace_equiv_address (src2, src2_reg);
1771 }
f4f867ca 1772 set_mem_size (src2, load_mode_size);
8845cb37
AS
1773
1774 do_load_for_compare (tmp_reg_src1, src1, load_mode);
1775 do_load_for_compare (tmp_reg_src2, src2, load_mode);
1776
1777 if (cmp_bytes < load_mode_size)
1778 {
1779 /* Shift unneeded bytes off. */
1780 rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
1781 if (word_mode == DImode)
1782 {
1783 emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
1784 emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
1785 }
1786 else
1787 {
1788 emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
1789 emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
1790 }
1791 }
1792
1793 int remain = bytes - cmp_bytes;
37ae4739 1794 if (GET_MODE_SIZE (GET_MODE (final_result)) > GET_MODE_SIZE (load_mode))
8845cb37 1795 {
37ae4739 1796 /* Final_result is larger than load size so we don't need to
8845cb37
AS
1797 reduce result size. */
1798
1799	  /* We previously did a block that needed 64->32 conversion but
1800 the current block does not, so a label is needed to jump
1801 to the end. */
37ae4739 1802 if (need_6432_conv && !final_label)
8845cb37
AS
1803 final_label = gen_label_rtx ();
1804
1805 if (remain > 0)
1806 {
1807 /* This is not the last block, branch to the end if the result
1808 of this subtract is not zero. */
1809 if (!final_label)
1810 final_label = gen_label_rtx ();
1811 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
1812 rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
1813 rtx cr = gen_reg_rtx (CCmode);
1814 rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
37ae4739 1815 emit_insn (gen_movsi (final_result,
8845cb37
AS
1816 gen_lowpart (SImode, tmp_reg_src2)));
1817 rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
1818 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
1819 fin_ref, pc_rtx);
1820 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
1821 JUMP_LABEL (j) = final_label;
1822 LABEL_NUSES (final_label) += 1;
1823 }
1824 else
1825 {
1826 if (word_mode == DImode)
1827 {
1828 emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
1829 tmp_reg_src2));
37ae4739 1830 emit_insn (gen_movsi (final_result,
8845cb37
AS
1831 gen_lowpart (SImode, tmp_reg_src2)));
1832 }
1833 else
37ae4739 1834 emit_insn (gen_subsi3 (final_result, tmp_reg_src1, tmp_reg_src2));
8845cb37
AS
1835
1836 if (final_label)
1837 {
1838 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
1839 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
5ec3397e 1840 JUMP_LABEL (j) = final_label;
8845cb37
AS
1841 LABEL_NUSES (final_label) += 1;
1842 emit_barrier ();
1843 }
1844 }
1845 }
1846 else
1847 {
1848 /* Do we need a 64->32 conversion block? We need the 64->32
37ae4739 1849 conversion even if final_result size == load_mode size because
8845cb37 1850 the subtract generates one extra bit. */
37ae4739 1851 need_6432_conv = true;
8845cb37
AS
1852
1853 if (remain > 0)
1854 {
1855 if (!convert_label)
1856 convert_label = gen_label_rtx ();
1857
1858 /* Compare to zero and branch to convert_label if not zero. */
1859 rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
1860 if (TARGET_P9_MISC)
1861 {
37ae4739
AS
1862 /* Generate a compare, and convert with a setb later.
1863 Use cond that is passed in because the caller needs
1864 to use it for the 64->32 conversion later. */
8845cb37
AS
1865 rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
1866 tmp_reg_src2);
1867 emit_insn (gen_rtx_SET (cond, cmp));
1868 }
1869 else
37ae4739
AS
1870 {
1871 /* Generate a subfc. and use the longer sequence for
1872 conversion. Cond is not used outside this
1873 function in this case. */
1874 cond = gen_reg_rtx (CCmode);
1875 if (TARGET_64BIT)
1876 emit_insn (gen_subfdi3_carry_dot2 (sub_result, tmp_reg_src2,
1877 tmp_reg_src1, cond));
1878 else
1879 emit_insn (gen_subfsi3_carry_dot2 (sub_result, tmp_reg_src2,
1880 tmp_reg_src1, cond));
1881 }
1882
8845cb37
AS
1883 rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
1884 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
1885 cvt_ref, pc_rtx);
1886 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
5ec3397e 1887 JUMP_LABEL (j) = convert_label;
8845cb37
AS
1888 LABEL_NUSES (convert_label) += 1;
1889 }
1890 else
1891 {
1892 /* Just do the subtract/compare. Since this is the last block
1893 the convert code will be generated immediately following. */
1894 if (TARGET_P9_MISC)
1895 {
1896 rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
1897 tmp_reg_src2);
1898 emit_insn (gen_rtx_SET (cond, cmp));
1899 }
1900 else
1901 if (TARGET_64BIT)
37ae4739 1902 emit_insn (gen_subfdi3_carry (sub_result, tmp_reg_src2,
8845cb37
AS
1903 tmp_reg_src1));
1904 else
37ae4739 1905 emit_insn (gen_subfsi3_carry (sub_result, tmp_reg_src2,
8845cb37
AS
1906 tmp_reg_src1));
1907 }
1908 }
1909
1910 offset += cmp_bytes;
1911 bytes -= cmp_bytes;
1912 }
1913
37ae4739
AS
1914 if (convert_label)
1915 *p_convert_label = convert_label;
1916 if (final_label)
1917 *p_final_label = final_label;
1918 return need_6432_conv;
1919}
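
/* For illustration only (not part of the GCC sources): a plain-C model of
   how the loop above reads a tail chunk shorter than the load size.
   Either the load is backed up so it ends exactly at the end of the data
   (the P8/P9 overlapping-load trick) or the unwanted low-order bytes are
   shifted off after the load.  Both strings go through the same
   adjustment, so the returned values compare equal iff the tails do.
   All names are invented; LOAD_SIZE is at most 8 and the expander
   arranges that LOAD_SIZE bytes are readable at the adjusted offset.  */

#include <stdint.h>
#include <stddef.h>

static uint64_t
model_load_be (const unsigned char *p, size_t n)
{
  /* Like ldbrx/lwbrx on little-endian: first byte most significant.  */
  uint64_t v = 0;
  for (size_t i = 0; i < n; i++)
    v = (v << 8) | p[i];
  return v;
}

static uint64_t
model_tail_chunk (const unsigned char *buf, size_t offset, size_t bytes_left,
		  size_t load_size, int efficient_overlap)
{
  size_t cmp_bytes = bytes_left;
  if (bytes_left < load_size && efficient_overlap
      && load_size - bytes_left < offset)
    {
      /* Back the load up; it rereads some already-compared bytes and
	 covers the whole tail, so no shift is needed afterwards.  */
      offset -= load_size - bytes_left;
      cmp_bytes = load_size;
    }
  uint64_t v = model_load_be (buf + offset, load_size);
  if (cmp_bytes < load_size)
    /* lshrdi3/lshrsi3 above: discard the bytes past the tail.  */
    v >>= 8 * (load_size - cmp_bytes);
  return v;
}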
1920
1921/* Expand a block compare operation, and return true if successful.
1922 Return false if we should let the compiler generate normal code,
1923 probably a memcmp call.
1924
1925 OPERANDS[0] is the target (result).
1926 OPERANDS[1] is the first source.
1927 OPERANDS[2] is the second source.
1928 OPERANDS[3] is the length.
1929 OPERANDS[4] is the alignment. */
1930bool
1931expand_block_compare (rtx operands[])
1932{
1933 rtx target = operands[0];
1934 rtx orig_src1 = operands[1];
1935 rtx orig_src2 = operands[2];
1936 rtx bytes_rtx = operands[3];
1937 rtx align_rtx = operands[4];
1938
1939 /* This case is complicated to handle because the subtract
1940 with carry instructions do not generate the 64-bit
1941 carry and so we must emit code to calculate it ourselves.
1942 We choose not to implement this yet. */
1943 if (TARGET_32BIT && TARGET_POWERPC64)
1944 return false;
1945
1946 bool isP7 = (rs6000_tune == PROCESSOR_POWER7);
1947
1948 /* Allow this param to shut off all expansion. */
1949 if (rs6000_block_compare_inline_limit == 0)
1950 return false;
1951
1952 /* targetm.slow_unaligned_access -- don't do unaligned stuff.
1953 However slow_unaligned_access returns true on P7 even though the
1954 performance of this code is good there. */
1955 if (!isP7
1956 && (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
1957 || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2))))
1958 return false;
1959
1960 /* Unaligned l*brx traps on P7 so don't do this. However this should
1961 not affect much because LE isn't really supported on P7 anyway. */
1962 if (isP7 && !BYTES_BIG_ENDIAN)
1963 return false;
1964
1965 /* If this is not a fixed size compare, try generating loop code and
1966 if that fails just call memcmp. */
1967 if (!CONST_INT_P (bytes_rtx))
1968 return expand_compare_loop (operands);
1969
1970 /* This must be a fixed size alignment. */
1971 if (!CONST_INT_P (align_rtx))
1972 return false;
1973
1974 unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;
1975
1976 gcc_assert (GET_MODE (target) == SImode);
1977
1978 /* Anything to move? */
1979 unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
1980 if (bytes == 0)
1981 return true;
1982
1983 /* P7/P8 code uses cond for subfc. but P9 uses
1984 it for cmpld which needs CCUNSmode. */
1985 rtx cond = NULL;
1986 if (TARGET_P9_MISC)
1987 cond = gen_reg_rtx (CCUNSmode);
1988
1989	  /* Is it OK to use vec/vsx for this?  TARGET_VSX means we have at
1990 least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
1991 at least POWER8. That way we can rely on overlapping compares to
1992 do the final comparison of less than 16 bytes. Also I do not
1993 want to deal with making this work for 32 bits. In addition, we
1994 have to make sure that we have at least P8_VECTOR (we don't allow
1995 P9_VECTOR without P8_VECTOR). */
1996 int use_vec = (bytes >= 33 && !TARGET_32BIT
1997 && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR);
1998
1999 /* We don't want to generate too much code. The loop code can take
2000 over for lengths greater than 31 bytes. */
2001 unsigned HOST_WIDE_INT max_bytes = rs6000_block_compare_inline_limit;
2002
2003 /* Don't generate too much code if vsx was disabled. */
2004 if (!use_vec && max_bytes > 1)
2005 max_bytes = ((max_bytes + 1) / 2) - 1;
2006
2007 if (!IN_RANGE (bytes, 1, max_bytes))
2008 return expand_compare_loop (operands);
2009
2010 /* The code generated for p7 and older is not faster than glibc
2011 memcmp if alignment is small and length is not short, so bail
2012 out to avoid those conditions. */
2013 if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
2014 && ((base_align == 1 && bytes > 16)
2015 || (base_align == 2 && bytes > 32)))
2016 return false;
2017
2018 rtx final_label = NULL;
2019
2020 if (use_vec)
8845cb37 2021 {
37ae4739
AS
2022 rtx final_move_label = gen_label_rtx ();
2023 rtx s1addr = gen_reg_rtx (Pmode);
2024 rtx s2addr = gen_reg_rtx (Pmode);
2025 rtx off_reg = gen_reg_rtx (Pmode);
2026 rtx cleanup_label = NULL;
2027 rtx vec_result = gen_reg_rtx (V16QImode);
2028 rtx s1data = gen_reg_rtx (V16QImode);
2029 rtx s2data = gen_reg_rtx (V16QImode);
2030 rtx result_reg = gen_reg_rtx (word_mode);
2031 emit_move_insn (result_reg, GEN_INT (0));
8845cb37 2032
37ae4739
AS
2033 expand_cmp_vec_sequence (bytes, orig_src1, orig_src2,
2034 s1addr, s2addr, off_reg, s1data, s2data,
2035 vec_result, false,
2036 &cleanup_label, final_move_label, false);
2037
2038 if (cleanup_label)
2039 emit_label (cleanup_label);
2040
2041 emit_insn (gen_one_cmplv16qi2 (vec_result, vec_result));
2042
2043 emit_final_compare_vec (s1data, s2data, result_reg,
2044 s1addr, s2addr, orig_src1, orig_src2,
2045 off_reg, vec_result);
2046
2047 emit_label (final_move_label);
2048 emit_insn (gen_movsi (target,
2049 gen_lowpart (SImode, result_reg)));
2050 }
2051 else
2052 { /* generate GPR code */
2053
2054 rtx convert_label = NULL;
2055 rtx sub_result = gen_reg_rtx (word_mode);
2056 bool need_6432_conversion =
2057	expand_block_compare_gpr (bytes, base_align,
2058 orig_src1, orig_src2,
2059 sub_result, cond, target,
2060 &convert_label, &final_label);
2061
2062 if (need_6432_conversion)
8845cb37 2063 {
37ae4739
AS
2064 if (convert_label)
2065 emit_label (convert_label);
2066 if (TARGET_P9_MISC)
2067 emit_insn (gen_setb_unsigned (target, cond));
8845cb37 2068 else
37ae4739 2069	    generate_6432_conversion (target, sub_result);
8845cb37
AS
2070 }
2071 }
2072
2073 if (final_label)
2074 emit_label (final_label);
2075
8845cb37
AS
2076 return true;
2077}
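
/* For illustration only (not part of the GCC sources): on power9 the
   expansion above emits cmpld followed by setb instead of the
   subfc/subfe/popcntd/or conversion.  A plain-C model of what that pair
   produces (the function name is invented):  */

static int
model_cmpld_setb (unsigned long long a, unsigned long long b)
{
  /* setb turns the CR field written by cmpld into -1 / 0 / 1.  */
  return (a < b) ? -1 : (a > b) ? 1 : 0;
}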
2078
f7e94dfb 2079/* Generate page crossing check and branch code to set up for
8845cb37
AS
2080 strncmp when we don't have DI alignment.
2081 STRNCMP_LABEL is the label to branch if there is a page crossing.
f7e94dfb 2082 SRC_ADDR is the string address to be examined.
8845cb37
AS
2083 BYTES is the max number of bytes to compare. */
2084static void
f7e94dfb 2085expand_strncmp_align_check (rtx strncmp_label, rtx src_addr, HOST_WIDE_INT bytes)
8845cb37
AS
2086{
2087 rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label);
f7e94dfb
AS
2088 rtx src_pgoff = gen_reg_rtx (GET_MODE (src_addr));
2089 do_and3 (src_pgoff, src_addr, GEN_INT (0xfff));
8845cb37 2090 rtx cond = gen_reg_rtx (CCmode);
f7e94dfb 2091 emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_pgoff,
8845cb37
AS
2092 GEN_INT (4096 - bytes)));
2093
0c791c59 2094 rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx);
8845cb37
AS
2095
2096 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
0c791c59 2097 lab_ref, pc_rtx);
8845cb37
AS
2098 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
2099 JUMP_LABEL (j) = strncmp_label;
2100 LABEL_NUSES (strncmp_label) += 1;
2101}
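
/* For illustration only (not part of the GCC sources): the check emitted
   above, in plain C.  A BYTES-byte access at ADDR stays inside one 4K
   page when its page offset is below 4096 - BYTES; the emitted
   greater-or-equal test is conservative by one byte.  The function name
   is invented.  */

#include <stdint.h>

static int
model_page_cross (uintptr_t addr, long bytes)
{
  /* Mask the low 12 bits of the address (rldicl), compare with
     4096 - bytes, and branch to the library call on greater-or-equal.  */
  return (long) (addr & 0xfff) >= 4096 - bytes;
}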
2102
74f9986e
AS
2103/* Generate the sequence of compares for strcmp/strncmp using gpr instructions.
2104 BYTES_TO_COMPARE is the number of bytes to be compared.
2105 BASE_ALIGN is the smaller of the alignment of the two strings.
2106 ORIG_SRC1 is the unmodified rtx for the first string.
2107 ORIG_SRC2 is the unmodified rtx for the second string.
2108 TMP_REG_SRC1 is the register for loading the first string.
2109 TMP_REG_SRC2 is the register for loading the second string.
2110 RESULT_REG is the rtx for the result register.
2111 EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
2112 to strcmp/strncmp if we have equality at the end of the inline comparison.
9d36bd3b
AS
2113 P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
2114 to clean up and generate the final comparison result.
ef4adf1f 2115 FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
74f9986e
AS
2116 set the final result. */
2117static void
9d36bd3b
AS
2118expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
2119 unsigned int base_align,
2120 rtx orig_src1, rtx orig_src2,
2121 rtx tmp_reg_src1, rtx tmp_reg_src2, rtx result_reg,
2122 bool equality_compare_rest, rtx *p_cleanup_label,
2123 rtx final_move_label)
74f9986e
AS
2124{
2125 unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
2126 machine_mode load_mode;
2127 unsigned int load_mode_size;
2128 unsigned HOST_WIDE_INT cmp_bytes = 0;
2129 unsigned HOST_WIDE_INT offset = 0;
2130 rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
2131 rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
9d36bd3b
AS
2132 gcc_assert (p_cleanup_label != NULL);
2133 rtx cleanup_label = *p_cleanup_label;
74f9986e
AS
2134
2135 while (bytes_to_compare > 0)
2136 {
2137 /* GPR compare sequence:
ef4adf1f
AS
2138 check each 8B with: ld/ld/cmpb/cmpb/orc./bne
2139
74f9986e 2140 cleanup code at end:
74f9986e
AS
2141 cntlzd get bit of first zero/diff byte
2142	 addi	convert for rldcl use
2143 rldcl rldcl extract diff/zero byte
2144 subf subtract for final result
2145
2146 The last compare can branch around the cleanup code if the
2147 result is zero because the strings are exactly equal. */
ef4adf1f 2148
74f9986e
AS
2149 unsigned int align = compute_current_alignment (base_align, offset);
2150 load_mode = select_block_compare_mode (offset, bytes_to_compare, align);
2151 load_mode_size = GET_MODE_SIZE (load_mode);
2152 if (bytes_to_compare >= load_mode_size)
2153 cmp_bytes = load_mode_size;
2154 else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
2155 {
2156 /* Move this load back so it doesn't go past the end.
2157 P8/P9 can do this efficiently. */
2158 unsigned int extra_bytes = load_mode_size - bytes_to_compare;
2159 cmp_bytes = bytes_to_compare;
2160 if (extra_bytes < offset)
2161 {
2162 offset -= extra_bytes;
2163 cmp_bytes = load_mode_size;
2164 bytes_to_compare = cmp_bytes;
2165 }
2166 }
2167 else
2168 /* P7 and earlier can't do the overlapping load trick fast,
2169 so this forces a non-overlapping load and a shift to get
2170 rid of the extra bytes. */
2171 cmp_bytes = bytes_to_compare;
2172
122d6c36
AS
2173 rtx offset_rtx;
2174 if (BYTES_BIG_ENDIAN || TARGET_AVOID_XFORM)
2175 offset_rtx = GEN_INT (offset);
2176 else
2177 {
2178 offset_rtx = gen_reg_rtx (Pmode);
2179 emit_move_insn (offset_rtx, GEN_INT (offset));
2180 }
2181 rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_rtx);
2182 rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_rtx);
37ae4739 2183
74f9986e 2184 do_load_for_compare_from_addr (load_mode, tmp_reg_src1, addr1, orig_src1);
74f9986e
AS
2185 do_load_for_compare_from_addr (load_mode, tmp_reg_src2, addr2, orig_src2);
2186
2187 /* We must always left-align the data we read, and
2188 clear any bytes to the right that are beyond the string.
2189 Otherwise the cmpb sequence won't produce the correct
ef4adf1f
AS
2190 results. However if there is only one byte left, we
2191 can just subtract to get the final result so the shifts
2192 and clears are not needed. */
74f9986e 2193
ef4adf1f 2194 unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;
74f9986e 2195
ef4adf1f
AS
2196 /* Loading just a single byte is a special case. If we are
2197 loading more than that, we have to check whether we are
2198 looking at the entire chunk of data. If not, rotate left and
2199 clear right so that bytes we aren't supposed to look at are
2200 zeroed, and the first byte we are supposed to compare is
2201 leftmost. */
2202 if (load_mode_size != 1)
74f9986e 2203 {
ef4adf1f
AS
2204 if (load_mode_size < word_mode_size)
2205 {
2206 /* Rotate left first. */
2207 rtx sh = GEN_INT (BITS_PER_UNIT
2208 * (word_mode_size - load_mode_size));
2209 do_rotl3 (tmp_reg_src1, tmp_reg_src1, sh);
2210 do_rotl3 (tmp_reg_src2, tmp_reg_src2, sh);
2211 }
2212
2213 if (cmp_bytes < word_mode_size)
2214 {
2215 /* Now clear right. This plus the rotate can be
2216 turned into a rldicr instruction. */
2217 HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
2218 rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
2219 do_and3 (tmp_reg_src1, tmp_reg_src1, mask);
2220 do_and3 (tmp_reg_src2, tmp_reg_src2, mask);
2221 }
74f9986e
AS
2222 }
2223
2224 /* Cases to handle. A and B are chunks of the two strings.
2225 1: Not end of comparison:
2226 A != B: branch to cleanup code to compute result.
2227 A == B: check for 0 byte, next block if not found.
2228 2: End of the inline comparison:
2229 A != B: branch to cleanup code to compute result.
2230 A == B: check for 0 byte, call strcmp/strncmp
2231	 3: Compared requested N bytes:
2232 A == B: branch to result 0.
2233 A != B: cleanup code to compute result. */
2234
74f9986e
AS
2235 rtx dst_label;
2236 if (remain > 0 || equality_compare_rest)
2237 {
2238 /* Branch to cleanup code, otherwise fall through to do
2239 more compares. */
2240 if (!cleanup_label)
2241 cleanup_label = gen_label_rtx ();
2242 dst_label = cleanup_label;
2243 }
2244 else
2245 /* Branch to end and produce result of 0. */
2246 dst_label = final_move_label;
2247
ef4adf1f
AS
2248 if (load_mode_size == 1)
2249 {
2250 /* Special case for comparing just single byte. */
2251 if (equality_compare_rest)
2252 {
2253 /* Use subf./bne to branch to final_move_label if the
2254 byte differs, otherwise fall through to the strncmp
2255 call. We must also check for a zero byte here as we
2256 must not make the library call if this is the end of
2257 the string. */
2258
2259 rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
2260 rtx cond = gen_reg_rtx (CCmode);
2261 rtx diff_rtx = gen_rtx_MINUS (word_mode,
2262 tmp_reg_src1, tmp_reg_src2);
2263 rs6000_emit_dot_insn (result_reg, diff_rtx, 2, cond);
2264 rtx cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
2265
2266 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
2267 lab_ref, pc_rtx);
2268 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
2269 JUMP_LABEL (j) = final_move_label;
2270 LABEL_NUSES (final_move_label) += 1;
74f9986e 2271
ef4adf1f
AS
2272 /* Check for zero byte here before fall through to
2273 library call. This catches the case where the
2274 strings are equal and end in a zero byte at this
2275 position. */
74f9986e 2276
ef4adf1f
AS
2277 rtx cond0 = gen_reg_rtx (CCmode);
2278 emit_move_insn (cond0, gen_rtx_COMPARE (CCmode, tmp_reg_src1,
2279 const0_rtx));
74f9986e 2280
ef4adf1f 2281 rtx cmp0eq_rtx = gen_rtx_EQ (VOIDmode, cond0, const0_rtx);
74f9986e 2282
ef4adf1f
AS
2283 rtx ifelse0 = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp0eq_rtx,
2284 lab_ref, pc_rtx);
2285 rtx j0 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse0));
2286 JUMP_LABEL (j0) = final_move_label;
2287 LABEL_NUSES (final_move_label) += 1;
2288 }
2289 else
2290 {
2291 /* This is the last byte to be compared so we can use
2292 subf to compute the final result and branch
2293 unconditionally to final_move_label. */
2294
2295 do_sub3 (result_reg, tmp_reg_src1, tmp_reg_src2);
2296
2297 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
2298 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
2299 JUMP_LABEL (j) = final_move_label;
2300 LABEL_NUSES (final_move_label) += 1;
2301 emit_barrier ();
2302 }
2303 }
2304 else
74f9986e 2305 {
74f9986e 2306 rtx cmpb_zero = gen_reg_rtx (word_mode);
ef4adf1f 2307 rtx cmpb_diff = gen_reg_rtx (word_mode);
74f9986e 2308 rtx zero_reg = gen_reg_rtx (word_mode);
ef4adf1f
AS
2309 rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
2310 rtx cond = gen_reg_rtx (CCmode);
2311
74f9986e 2312 emit_move_insn (zero_reg, GEN_INT (0));
ef4adf1f 2313 do_cmpb3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2);
74f9986e 2314 do_cmpb3 (cmpb_zero, tmp_reg_src1, zero_reg);
ef4adf1f
AS
2315 rtx not_diff = gen_rtx_NOT (word_mode, cmpb_diff);
2316 rtx orc_rtx = gen_rtx_IOR (word_mode, not_diff, cmpb_zero);
74f9986e 2317
ef4adf1f 2318 rs6000_emit_dot_insn (result_reg, orc_rtx, 2, cond);
74f9986e 2319
ef4adf1f
AS
2320 rtx cmp_rtx;
2321 if (remain == 0 && !equality_compare_rest)
2322 cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
2323 else
2324 cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
74f9986e 2325
ef4adf1f
AS
2326 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
2327 lab_ref, pc_rtx);
2328 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
2329 JUMP_LABEL (j) = dst_label;
2330 LABEL_NUSES (dst_label) += 1;
74f9986e
AS
2331 }
2332
2333 offset += cmp_bytes;
2334 bytes_to_compare -= cmp_bytes;
2335 }
2336
9d36bd3b
AS
2337 *p_cleanup_label = cleanup_label;
2338 return;
2339}
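
/* For illustration only (not part of the GCC sources): a plain-C model of
   the ld/ld/cmpb/cmpb/orc. chunk test emitted by the loop above.  cmpb
   writes 0xff into each byte where its two inputs agree; the orc. result
   is nonzero exactly when the chunk holds a differing byte or a zero byte
   in the first string, which is when the cleanup code must run.  Names
   are invented.  */

#include <stdint.h>

static uint64_t
model_cmpb (uint64_t a, uint64_t b)
{
  uint64_t r = 0;
  for (int i = 0; i < 8; i++)
    {
      uint64_t mask = 0xffULL << (8 * i);
      if ((a & mask) == (b & mask))
	r |= mask;
    }
  return r;
}

static int
model_chunk_needs_cleanup (uint64_t s1, uint64_t s2)
{
  uint64_t cmpb_diff = model_cmpb (s1, s2);	/* 0xff where bytes match.  */
  uint64_t cmpb_zero = model_cmpb (s1, 0);	/* 0xff where s1 byte is 0.  */
  return (~cmpb_diff | cmpb_zero) != 0;		/* orc. sets CR0.  */
}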
2340
f7e94dfb
AS
2341/* Generate the final sequence that identifies the differing
2342 byte and generates the final result, taking into account
2343 zero bytes:
ef4adf1f 2344
f7e94dfb
AS
2345 cntlzd get bit of first zero/diff byte
2346 addi convert for rldcl use
2347 rldcl rldcl extract diff/zero byte
2348 subf subtract for final result
2349
2350 STR1 is the reg rtx for data from string 1.
2351 STR2 is the reg rtx for data from string 2.
2352 RESULT is the reg rtx for the comparison result. */
2353
2354static void
2355emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result)
2356{
2357 machine_mode m = GET_MODE (str1);
f7e94dfb 2358 rtx rot_amt = gen_reg_rtx (m);
f7e94dfb
AS
2359
2360 rtx rot1_1 = gen_reg_rtx (m);
2361 rtx rot1_2 = gen_reg_rtx (m);
2362 rtx rot2_1 = gen_reg_rtx (m);
2363 rtx rot2_2 = gen_reg_rtx (m);
2364
2365 if (m == SImode)
2366 {
ef4adf1f 2367 emit_insn (gen_clzsi2 (rot_amt, result));
f7e94dfb
AS
2368 emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
2369 emit_insn (gen_rotlsi3 (rot1_1, str1,
2370 gen_lowpart (SImode, rot_amt)));
2371 emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
2372 emit_insn (gen_rotlsi3 (rot2_1, str2,
2373 gen_lowpart (SImode, rot_amt)));
2374 emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
2375 emit_insn (gen_subsi3 (result, rot1_2, rot2_2));
2376 }
2377 else if (m == DImode)
2378 {
ef4adf1f 2379 emit_insn (gen_clzdi2 (rot_amt, result));
f7e94dfb
AS
2380 emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
2381 emit_insn (gen_rotldi3 (rot1_1, str1,
2382 gen_lowpart (SImode, rot_amt)));
2383 emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
2384 emit_insn (gen_rotldi3 (rot2_1, str2,
2385 gen_lowpart (SImode, rot_amt)));
2386 emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
2387 emit_insn (gen_subdi3 (result, rot1_2, rot2_2));
2388 }
2389 else
2390 gcc_unreachable ();
ef4adf1f 2391
f7e94dfb
AS
2392 return;
2393}
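
/* For illustration only (not part of the GCC sources): a plain-C model of
   the DImode cleanup above.  W1 and W2 hold the loaded chunks with the
   first string byte leftmost, and at least one differing or zero byte is
   present (the branches that reach this code guarantee it), so MARKS is
   nonzero and __builtin_clzll is defined.  Names are invented.  */

#include <stdint.h>

static uint64_t
model_rotl64 (uint64_t w, unsigned r)
{
  r &= 63;
  return r ? (w << r) | (w >> (64 - r)) : w;
}

static int
model_final_str_compare (uint64_t w1, uint64_t w2)
{
  /* MARKS recomputes the orc. result: 0xff in each byte that differs
     between W1 and W2 or is zero in W1.  */
  uint64_t marks = 0;
  for (int i = 0; i < 8; i++)
    {
      uint64_t b1 = (w1 >> (56 - 8 * i)) & 0xff;
      uint64_t b2 = (w2 >> (56 - 8 * i)) & 0xff;
      if (b1 != b2 || b1 == 0)
	marks |= 0xffULL << (56 - 8 * i);
    }
  unsigned rot = __builtin_clzll (marks) + 8;	 /* cntlzd, then addi 8.  */
  uint64_t byte1 = model_rotl64 (w1, rot) & 0xff;  /* rldcl + and.  */
  uint64_t byte2 = model_rotl64 (w2, rot) & 0xff;  /* rldcl + and.  */
  return (int) byte1 - (int) byte2;		 /* subf.  */
}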
2394
8845cb37 2395/* Expand a string compare operation with length, and return
ef4adf1f 2396 true if successful. Return false if we should let the
8845cb37
AS
2397 compiler generate normal code, probably a strncmp call.
2398
2399 OPERANDS[0] is the target (result).
2400 OPERANDS[1] is the first source.
2401 OPERANDS[2] is the second source.
2402 If NO_LENGTH is zero, then:
2403 OPERANDS[3] is the length.
2404 OPERANDS[4] is the alignment in bytes.
2405 If NO_LENGTH is nonzero, then:
2406 OPERANDS[3] is the alignment in bytes. */
2407bool
2408expand_strn_compare (rtx operands[], int no_length)
2409{
2410 rtx target = operands[0];
2411 rtx orig_src1 = operands[1];
2412 rtx orig_src2 = operands[2];
2413 rtx bytes_rtx, align_rtx;
2414 if (no_length)
2415 {
2416 bytes_rtx = NULL;
2417 align_rtx = operands[3];
2418 }
2419 else
2420 {
2421 bytes_rtx = operands[3];
2422 align_rtx = operands[4];
2423 }
74f9986e 2424
f7e94dfb
AS
2425 rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
2426 rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
8845cb37 2427
ef4adf1f 2428 /* If we have a length, it must be constant. This simplifies things
8845cb37 2429 a bit as we don't have to generate code to check if we've exceeded
ef4adf1f 2430	     the length.  Later this could be extended to handle variable lengths.  */
8845cb37
AS
2431 if (!no_length && !CONST_INT_P (bytes_rtx))
2432 return false;
2433
2434 /* This must be a fixed size alignment. */
2435 if (!CONST_INT_P (align_rtx))
2436 return false;
2437
2438 unsigned int base_align = UINTVAL (align_rtx);
f7e94dfb
AS
2439 unsigned int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
2440 unsigned int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
8845cb37 2441
e0bd6c9f
RS
2442 /* targetm.slow_unaligned_access -- don't do unaligned stuff. */
2443 if (targetm.slow_unaligned_access (word_mode, align1)
2444 || targetm.slow_unaligned_access (word_mode, align2))
8845cb37
AS
2445 return false;
2446
2447 gcc_assert (GET_MODE (target) == SImode);
2448
9d36bd3b 2449 unsigned int required_align = 8;
8845cb37
AS
2450
2451 unsigned HOST_WIDE_INT offset = 0;
2452 unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available. */
2453 unsigned HOST_WIDE_INT compare_length; /* How much to compare inline. */
9d36bd3b 2454
8845cb37 2455 if (no_length)
9d36bd3b 2456 bytes = rs6000_string_compare_inline_limit;
8845cb37
AS
2457 else
2458 bytes = UINTVAL (bytes_rtx);
2459
ef4adf1f 2460	  /* Is it OK to use vec/vsx for this?  TARGET_VSX means we have at
9d36bd3b
AS
2461 least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
2462 at least POWER8. That way we can rely on overlapping compares to
6bd2b8ec
AS
2463 do the final comparison of less than 16 bytes. Also I do not
2464 want to deal with making this work for 32 bits. In addition, we
2465 have to make sure that we have at least P8_VECTOR (we don't allow
2466 P9_VECTOR without P8_VECTOR). */
2467 int use_vec = (bytes >= 16 && !TARGET_32BIT
2468 && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR);
9d36bd3b
AS
2469
2470 if (use_vec)
2471 required_align = 16;
2472
2473 machine_mode load_mode;
2474 rtx tmp_reg_src1, tmp_reg_src2;
2475 if (use_vec)
2476 {
2477 load_mode = V16QImode;
2478 tmp_reg_src1 = gen_reg_rtx (V16QImode);
2479 tmp_reg_src2 = gen_reg_rtx (V16QImode);
2480 }
2481 else
2482 {
2483 load_mode = select_block_compare_mode (0, bytes, base_align);
2484 tmp_reg_src1 = gen_reg_rtx (word_mode);
2485 tmp_reg_src2 = gen_reg_rtx (word_mode);
2486 }
2487
2488 compare_length = rs6000_string_compare_inline_limit;
8845cb37
AS
2489
2490 /* If we have equality at the end of the last compare and we have not
2491 found the end of the string, we need to call strcmp/strncmp to
2492 compare the remainder. */
2493 bool equality_compare_rest = false;
2494
2495 if (no_length)
2496 {
2497 bytes = compare_length;
2498 equality_compare_rest = true;
2499 }
2500 else
2501 {
2502 if (bytes <= compare_length)
2503 compare_length = bytes;
2504 else
2505 equality_compare_rest = true;
2506 }
2507
2508 rtx result_reg = gen_reg_rtx (word_mode);
2509 rtx final_move_label = gen_label_rtx ();
2510 rtx final_label = gen_label_rtx ();
2511 rtx begin_compare_label = NULL;
ef4adf1f 2512
f7e94dfb 2513 if (base_align < required_align)
8845cb37
AS
2514 {
2515 /* Generate code that checks distance to 4k boundary for this case. */
2516 begin_compare_label = gen_label_rtx ();
2517 rtx strncmp_label = gen_label_rtx ();
2518 rtx jmp;
2519
2520 /* Strncmp for power8 in glibc does this:
5ec3397e
AS
2521 rldicl r8,r3,0,52
2522 cmpldi cr7,r8,4096-16
2523 bgt cr7,L(pagecross) */
8845cb37
AS
2524
2525 /* Make sure that the length we use for the alignment test and
2526 the subsequent code generation are in agreement so we do not
2527 go past the length we tested for a 4k boundary crossing. */
2528 unsigned HOST_WIDE_INT align_test = compare_length;
9d36bd3b 2529 if (align_test < required_align)
8845cb37
AS
2530 {
2531 align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
2532 base_align = align_test;
2533 }
2534 else
2535 {
f7e94dfb
AS
2536 align_test = ROUND_UP (align_test, required_align);
2537 base_align = required_align;
8845cb37
AS
2538 }
2539
f7e94dfb
AS
2540 if (align1 < required_align)
2541 expand_strncmp_align_check (strncmp_label, src1_addr, align_test);
2542 if (align2 < required_align)
2543 expand_strncmp_align_check (strncmp_label, src2_addr, align_test);
8845cb37
AS
2544
2545 /* Now generate the following sequence:
2546 - branch to begin_compare
2547 - strncmp_label
2548 - call to strncmp
2549 - branch to final_label
2550 - begin_compare_label */
2551
2552 rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
2553 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
2554 JUMP_LABEL (jmp) = begin_compare_label;
2555 LABEL_NUSES (begin_compare_label) += 1;
2556 emit_barrier ();
2557
2558 emit_label (strncmp_label);
2559
8845cb37
AS
2560 if (no_length)
2561 {
2562 tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
2563 emit_library_call_value (XEXP (DECL_RTL (fun), 0),
db69559b 2564 target, LCT_NORMAL, GET_MODE (target),
f7e94dfb
AS
2565 force_reg (Pmode, src1_addr), Pmode,
2566 force_reg (Pmode, src2_addr), Pmode);
8845cb37
AS
2567 }
2568 else
2569 {
2570 /* -m32 -mpowerpc64 results in word_mode being DImode even
9d36bd3b 2571 though otherwise it is 32-bit. The length arg to strncmp
8845cb37 2572 is a size_t which will be the same size as pointers. */
e9727bda
AS
2573 rtx len_rtx = gen_reg_rtx (Pmode);
2574 emit_move_insn (len_rtx, gen_int_mode (bytes, Pmode));
8845cb37
AS
2575
2576 tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
2577 emit_library_call_value (XEXP (DECL_RTL (fun), 0),
db69559b 2578 target, LCT_NORMAL, GET_MODE (target),
f7e94dfb
AS
2579 force_reg (Pmode, src1_addr), Pmode,
2580 force_reg (Pmode, src2_addr), Pmode,
e9727bda 2581 len_rtx, Pmode);
8845cb37
AS
2582 }
2583
2584 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
2585 jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
2586 JUMP_LABEL (jmp) = final_label;
2587 LABEL_NUSES (final_label) += 1;
2588 emit_barrier ();
2589 emit_label (begin_compare_label);
2590 }
2591
2592 rtx cleanup_label = NULL;
9d36bd3b 2593 rtx s1addr = NULL, s2addr = NULL, off_reg = NULL, vec_result = NULL;
8845cb37 2594
f7e94dfb 2595 /* Generate a sequence of GPR or VEC/VSX instructions to compare out
8845cb37 2596 to the length specified. */
9d36bd3b
AS
2597 if (use_vec)
2598 {
2599 s1addr = gen_reg_rtx (Pmode);
2600 s2addr = gen_reg_rtx (Pmode);
2601 off_reg = gen_reg_rtx (Pmode);
2602 vec_result = gen_reg_rtx (load_mode);
2603 emit_move_insn (result_reg, GEN_INT (0));
37ae4739
AS
2604 expand_cmp_vec_sequence (compare_length,
2605 orig_src1, orig_src2,
2606 s1addr, s2addr, off_reg,
2607 tmp_reg_src1, tmp_reg_src2,
2608 vec_result,
2609 equality_compare_rest,
2610 &cleanup_label, final_move_label, true);
9d36bd3b
AS
2611 }
2612 else
2613 expand_strncmp_gpr_sequence (compare_length, base_align,
2614 orig_src1, orig_src2,
2615 tmp_reg_src1, tmp_reg_src2,
2616 result_reg,
2617 equality_compare_rest,
2618 &cleanup_label, final_move_label);
74f9986e
AS
2619
2620 offset = compare_length;
ef4adf1f 2621
8845cb37
AS
2622 if (equality_compare_rest)
2623 {
2624 /* Update pointers past what has been compared already. */
f7e94dfb
AS
2625 rtx src1 = force_reg (Pmode,
2626 gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset)));
2627 rtx src2 = force_reg (Pmode,
2628 gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset)));
8845cb37
AS
2629
2630 /* Construct call to strcmp/strncmp to compare the rest of the string. */
2631 if (no_length)
2632 {
2633 tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
2634 emit_library_call_value (XEXP (DECL_RTL (fun), 0),
db69559b 2635 target, LCT_NORMAL, GET_MODE (target),
f7e94dfb 2636 src1, Pmode, src2, Pmode);
8845cb37
AS
2637 }
2638 else
2639 {
e9727bda
AS
2640 rtx len_rtx = gen_reg_rtx (Pmode);
2641 emit_move_insn (len_rtx, gen_int_mode (bytes - compare_length, Pmode));
8845cb37
AS
2642 tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
2643 emit_library_call_value (XEXP (DECL_RTL (fun), 0),
db69559b 2644 target, LCT_NORMAL, GET_MODE (target),
e9727bda 2645 src1, Pmode, src2, Pmode, len_rtx, Pmode);
8845cb37
AS
2646 }
2647
2648 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
2649 rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
2650 JUMP_LABEL (jmp) = final_label;
2651 LABEL_NUSES (final_label) += 1;
2652 emit_barrier ();
2653 }
2654
2655 if (cleanup_label)
2656 emit_label (cleanup_label);
2657
9d36bd3b 2658 if (use_vec)
37ae4739
AS
2659 emit_final_compare_vec (tmp_reg_src1, tmp_reg_src2, result_reg,
2660 s1addr, s2addr, orig_src1, orig_src2,
2661 off_reg, vec_result);
9d36bd3b
AS
2662 else
2663 emit_final_str_compare_gpr (tmp_reg_src1, tmp_reg_src2, result_reg);
8845cb37
AS
2664
2665 emit_label (final_move_label);
2666 emit_insn (gen_movsi (target,
2667 gen_lowpart (SImode, result_reg)));
2668 emit_label (final_label);
2669 return true;
2670}
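
/* For illustration only (not part of the GCC sources): the align_test
   computation above in plain C, with ceil_log2 and ROUND_UP written out.
   This keeps the page-crossing test and the inline compares in
   agreement: REQUIRED_ALIGN is 8 for the GPR sequence or 16 for the
   vector one, and the caller also lowers base_align to match.  The
   function name is invented.  */

#include <stddef.h>

static size_t
model_align_test (size_t compare_length, size_t required_align)
{
  size_t align_test = compare_length;
  if (align_test < required_align)
    {
      /* HOST_WIDE_INT_1U << ceil_log2: round up to a power of two.  */
      size_t p = 1;
      while (p < align_test)
	p <<= 1;
      align_test = p;
    }
  else
    /* ROUND_UP: round up to a multiple of required_align (a power of
       two here).  */
    align_test = (align_test + required_align - 1) & ~(required_align - 1);
  return align_test;
}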
2671
19db0ebb
AS
2672/* Generate loads and stores for a move of v4si mode using lvx/stvx.
2673 This uses altivec_{l,st}vx_<mode>_internal which use unspecs to
2674 keep combine from changing what instruction gets used.
2675
2676 DEST is the destination for the data.
2677 SRC is the source of the data for the move. */
2678
2679static rtx
2680gen_lvx_v4si_move (rtx dest, rtx src)
2681{
2682 gcc_assert (MEM_P (dest) ^ MEM_P (src));
2683 gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);
2684
2685 if (MEM_P (dest))
2686 return gen_altivec_stvx_v4si_internal (dest, src);
2687 else
2688 return gen_altivec_lvx_v4si_internal (dest, src);
2689}
2690
8845cb37
AS
2691/* Expand a block move operation, and return 1 if successful. Return 0
2692 if we should let the compiler generate normal code.
2693
2694 operands[0] is the destination
2695 operands[1] is the source
2696 operands[2] is the length
2697 operands[3] is the alignment */
2698
2699#define MAX_MOVE_REG 4
2700
2701int
2702expand_block_move (rtx operands[])
2703{
2704 rtx orig_dest = operands[0];
2705 rtx orig_src = operands[1];
2706 rtx bytes_rtx = operands[2];
2707 rtx align_rtx = operands[3];
2e42a52f 2708 int constp = CONST_INT_P (bytes_rtx);
8845cb37
AS
2709 int align;
2710 int bytes;
2711 int offset;
2712 int move_bytes;
2713 rtx stores[MAX_MOVE_REG];
2714 int num_reg = 0;
2715
2716 /* If this is not a fixed size move, just call memcpy */
2717 if (! constp)
2718 return 0;
2719
2720 /* This must be a fixed size alignment */
2e42a52f 2721 gcc_assert (CONST_INT_P (align_rtx));
8845cb37
AS
2722 align = INTVAL (align_rtx) * BITS_PER_UNIT;
2723
2724 /* Anything to move? */
2725 bytes = INTVAL (bytes_rtx);
2726 if (bytes <= 0)
2727 return 1;
2728
2729 if (bytes > rs6000_block_move_inline_limit)
2730 return 0;
2731
2732 for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
2733 {
2734 union {
2735 rtx (*movmemsi) (rtx, rtx, rtx, rtx);
2736 rtx (*mov) (rtx, rtx);
2737 } gen_func;
2738 machine_mode mode = BLKmode;
2739 rtx src, dest;
2740
2741 /* Altivec first, since it will be faster than a string move
2742 when it applies, and usually not significantly larger. */
19db0ebb 2743 if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
8845cb37
AS
2744 {
2745 move_bytes = 16;
2746 mode = V4SImode;
19db0ebb 2747 gen_func.mov = gen_lvx_v4si_move;
8845cb37 2748 }
8845cb37
AS
2749 else if (bytes >= 8 && TARGET_POWERPC64
2750 && (align >= 64 || !STRICT_ALIGNMENT))
2751 {
2752 move_bytes = 8;
2753 mode = DImode;
2754 gen_func.mov = gen_movdi;
2755 if (offset == 0 && align < 64)
2756 {
2757 rtx addr;
2758
2759 /* If the address form is reg+offset with offset not a
2760 multiple of four, reload into reg indirect form here
2761 rather than waiting for reload. This way we get one
2762 reload, not one per load and/or store. */
2763 addr = XEXP (orig_dest, 0);
2764 if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
2e42a52f 2765 && CONST_INT_P (XEXP (addr, 1))
8845cb37
AS
2766 && (INTVAL (XEXP (addr, 1)) & 3) != 0)
2767 {
2768 addr = copy_addr_to_reg (addr);
2769 orig_dest = replace_equiv_address (orig_dest, addr);
2770 }
2771 addr = XEXP (orig_src, 0);
2772 if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
2e42a52f 2773 && CONST_INT_P (XEXP (addr, 1))
8845cb37
AS
2774 && (INTVAL (XEXP (addr, 1)) & 3) != 0)
2775 {
2776 addr = copy_addr_to_reg (addr);
2777 orig_src = replace_equiv_address (orig_src, addr);
2778 }
2779 }
2780 }
8845cb37
AS
2781 else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
2782 { /* move 4 bytes */
2783 move_bytes = 4;
2784 mode = SImode;
2785 gen_func.mov = gen_movsi;
2786 }
2787 else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
2788 { /* move 2 bytes */
2789 move_bytes = 2;
2790 mode = HImode;
2791 gen_func.mov = gen_movhi;
2792 }
8845cb37
AS
2793 else /* move 1 byte at a time */
2794 {
2795 move_bytes = 1;
2796 mode = QImode;
2797 gen_func.mov = gen_movqi;
2798 }
2799
2800 src = adjust_address (orig_src, mode, offset);
2801 dest = adjust_address (orig_dest, mode, offset);
2802
2803 if (mode != BLKmode)
2804 {
2805 rtx tmp_reg = gen_reg_rtx (mode);
2806
2807 emit_insn ((*gen_func.mov) (tmp_reg, src));
2808 stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
2809 }
2810
2811 if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
2812 {
2813 int i;
2814 for (i = 0; i < num_reg; i++)
2815 emit_insn (stores[i]);
2816 num_reg = 0;
2817 }
2818
2819 if (mode == BLKmode)
2820 {
2821 /* Move the address into scratch registers. The movmemsi
2822 patterns require zero offset. */
2823 if (!REG_P (XEXP (src, 0)))
2824 {
2825 rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
2826 src = replace_equiv_address (src, src_reg);
2827 }
2828 set_mem_size (src, move_bytes);
2829
2830 if (!REG_P (XEXP (dest, 0)))
2831 {
2832 rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
2833 dest = replace_equiv_address (dest, dest_reg);
2834 }
2835 set_mem_size (dest, move_bytes);
2836
2837 emit_insn ((*gen_func.movmemsi) (dest, src,
2838 GEN_INT (move_bytes & 31),
2839 align_rtx));
2840 }
2841 }
2842
2843 return 1;
2844}
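
/* For illustration only (not part of the GCC sources): the chunk-size
   selection made by the loop above, as a plain C function.  ALIGN is in
   bits, as in the expander; the function name is invented.  */

static int
model_move_chunk (int bytes, int align, int have_altivec, int have_64bit,
		  int strict_align)
{
  if (have_altivec && bytes >= 16 && align >= 128)
    return 16;		/* V4SImode via lvx/stvx.  */
  if (bytes >= 8 && have_64bit && (align >= 64 || !strict_align))
    return 8;		/* DImode.  */
  if (bytes >= 4 && (align >= 32 || !strict_align))
    return 4;		/* SImode.  */
  if (bytes >= 2 && (align >= 16 || !strict_align))
    return 2;		/* HImode.  */
  return 1;		/* QImode.  */
}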