/* Subroutines used to expand string and block move, clear,
   compare and other operations for PowerPC.
   Copyright (C) 1991-2017 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ira.h"
#include "print-tree.h"
#include "varasm.h"
#include "explow.h"
#include "expr.h"
#include "output.h"

/* Expand a block clear operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the length
   operands[3] is the alignment */

int
expand_block_clear (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx bytes_rtx = operands[1];
  rtx align_rtx = operands[3];
  bool constp = (GET_CODE (bytes_rtx) == CONST_INT);
  HOST_WIDE_INT align;
  HOST_WIDE_INT bytes;
  int offset;
  int clear_bytes;
  int clear_step;

  /* If this is not a fixed size clear, just call memset.  */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment.  */
  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to clear?  */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  /* Use the builtin memset after a point, to avoid huge code bloat.
     When optimize_size, avoid any significant code bloat; calling
     memset is about 4 instructions, so allow for one instruction to
     load zero and three to do clearing.  */
  if (TARGET_ALTIVEC && align >= 128)
    clear_step = 16;
  else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
    clear_step = 8;
  else
    clear_step = 4;

  if (optimize_size && bytes > 3 * clear_step)
    return 0;
  if (! optimize_size && bytes > 8 * clear_step)
    return 0;

  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
    {
      machine_mode mode = BLKmode;
      rtx dest;

      if (bytes >= 16 && TARGET_ALTIVEC && align >= 128)
        {
          clear_bytes = 16;
          mode = V4SImode;
        }
      else if (bytes >= 8 && TARGET_POWERPC64
               && (align >= 64 || !STRICT_ALIGNMENT))
        {
          clear_bytes = 8;
          mode = DImode;
          if (offset == 0 && align < 64)
            {
              rtx addr;

              /* If the address form is reg+offset with offset not a
                 multiple of four, reload into reg indirect form here
                 rather than waiting for reload.  This way we get one
                 reload, not one per store.  */
              addr = XEXP (orig_dest, 0);
              if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
                  && GET_CODE (XEXP (addr, 1)) == CONST_INT
                  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
                {
                  addr = copy_addr_to_reg (addr);
                  orig_dest = replace_equiv_address (orig_dest, addr);
                }
            }
        }
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
        {                       /* clear 4 bytes */
          clear_bytes = 4;
          mode = SImode;
        }
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
        {                       /* clear 2 bytes */
          clear_bytes = 2;
          mode = HImode;
        }
      else /* clear 1 byte at a time */
        {
          clear_bytes = 1;
          mode = QImode;
        }

      dest = adjust_address (orig_dest, mode, offset);

      emit_move_insn (dest, CONST0_RTX (mode));
    }

  return 1;
}

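/* Editorial example (not part of the original source): with
   TARGET_POWERPC64 and a destination known to be 8-byte aligned, a
   10-byte clear takes two iterations of the loop above, one DImode
   store and one HImode store, roughly:

	li 9,0
	std 9,0(3)
	sth 9,8(3)

   Register numbers are illustrative only; they depend on register
   allocation.  */
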
/* Figure out the correct instructions to generate to load data for
   block compare.  MODE is used for the read from memory, and
   data is zero extended if REG is wider than MODE.  If LE code
   is being generated, bswap loads are used.

   REG is the destination register to move the data into.
   MEM is the memory block being read.
   MODE is the mode of memory to use for the read.  */
static void
do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
{
  switch (GET_MODE (reg))
    {
    case DImode:
      switch (mode)
        {
        case QImode:
          emit_insn (gen_zero_extendqidi2 (reg, mem));
          break;
        case HImode:
          {
            rtx src = mem;
            if (!BYTES_BIG_ENDIAN)
              {
                src = gen_reg_rtx (HImode);
                emit_insn (gen_bswaphi2 (src, mem));
              }
            emit_insn (gen_zero_extendhidi2 (reg, src));
            break;
          }
        case SImode:
          {
            rtx src = mem;
            if (!BYTES_BIG_ENDIAN)
              {
                src = gen_reg_rtx (SImode);
                emit_insn (gen_bswapsi2 (src, mem));
              }
            emit_insn (gen_zero_extendsidi2 (reg, src));
          }
          break;
        case DImode:
          if (!BYTES_BIG_ENDIAN)
            emit_insn (gen_bswapdi2 (reg, mem));
          else
            emit_insn (gen_movdi (reg, mem));
          break;
        default:
          gcc_unreachable ();
        }
      break;

    case SImode:
      switch (mode)
        {
        case QImode:
          emit_insn (gen_zero_extendqisi2 (reg, mem));
          break;
        case HImode:
          {
            rtx src = mem;
            if (!BYTES_BIG_ENDIAN)
              {
                src = gen_reg_rtx (HImode);
                emit_insn (gen_bswaphi2 (src, mem));
              }
            emit_insn (gen_zero_extendhisi2 (reg, src));
            break;
          }
        case SImode:
          if (!BYTES_BIG_ENDIAN)
            emit_insn (gen_bswapsi2 (reg, mem));
          else
            emit_insn (gen_movsi (reg, mem));
          break;
        case DImode:
          /* DImode is larger than the destination reg, so it is not
             expected.  */
          gcc_unreachable ();
          break;
        default:
          gcc_unreachable ();
        }
      break;
    default:
      gcc_unreachable ();
      break;
    }
}

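/* Editorial sketch (an assumption about typical output, not from the
   original source): on a little-endian target with TARGET_LDBRX, an
   SImode chunk loaded into a DImode register by do_load_for_compare
   becomes a byte-reversed load plus a zero extend, roughly:

	lwbrx 9,0,4
	rldicl 9,9,0,32

   so both operands of the later subtract/compare hold the bytes in
   big-endian order and an unsigned word comparison matches memcmp
   ordering regardless of target endianness.  */
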
/* Select the mode to be used for reading the next chunk of bytes
   in the compare.

   OFFSET is the current read offset from the beginning of the block.
   BYTES is the number of bytes remaining to be read.
   ALIGN is the minimum alignment of the memory blocks being compared in bytes.
   WORD_MODE_OK indicates using WORD_MODE is allowed, else SImode is
   the largest allowable mode.  */
static machine_mode
select_block_compare_mode (unsigned HOST_WIDE_INT offset,
                           unsigned HOST_WIDE_INT bytes,
                           unsigned HOST_WIDE_INT align, bool word_mode_ok)
{
  /* First see if we can do a whole load unit
     as that will be more efficient than a larger load + shift.  */

  /* If big, use biggest chunk.
     If exactly chunk size, use that size.
     If remainder can be done in one piece with shifting, do that.
     Do largest chunk possible without violating alignment rules.  */

  /* The most we can read without potential page crossing.  */
  unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);

  if (word_mode_ok && bytes >= UNITS_PER_WORD)
    return word_mode;
  else if (bytes == GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes == GET_MODE_SIZE (HImode))
    return HImode;
  else if (bytes == GET_MODE_SIZE (QImode))
    return QImode;
  else if (bytes < GET_MODE_SIZE (SImode)
           && offset >= GET_MODE_SIZE (SImode) - bytes)
    /* This matches the case where we have SImode and 3 bytes
       and offset >= 1 and permits us to move back one and overlap
       with the previous read, thus avoiding having to shift
       unwanted bytes off of the input.  */
    return SImode;
  else if (word_mode_ok && bytes < UNITS_PER_WORD
           && offset >= UNITS_PER_WORD - bytes)
    /* Similarly, if we can use DImode it will get matched here and
       can do an overlapping read that ends at the end of the block.  */
    return word_mode;
  else if (word_mode_ok && maxread >= UNITS_PER_WORD)
    /* It is safe to do all remaining in one load of largest size,
       possibly with a shift to get rid of unwanted bytes.  */
    return word_mode;
  else if (maxread >= GET_MODE_SIZE (SImode))
    /* It is safe to do all remaining in one SImode load,
       possibly with a shift to get rid of unwanted bytes.  */
    return SImode;
  else if (bytes > GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes > GET_MODE_SIZE (HImode))
    return HImode;

  /* Final fallback is to do one byte at a time.  */
  return QImode;
}

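/* Editorial example (not from the original source): with word_mode
   DImode and WORD_MODE_OK set, an 18-byte compare is read as 8, 8, 2:
   the first two calls return word_mode (bytes >= UNITS_PER_WORD) and
   the last returns HImode (bytes == GET_MODE_SIZE (HImode)).  With 3
   bytes left at offset >= 1, SImode is chosen instead, so the load can
   be moved back one byte and overlap the previously compared chunk.  */
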
/* Compute the alignment of pointer+OFFSET where the original alignment
   of pointer was BASE_ALIGN.  */
static unsigned HOST_WIDE_INT
compute_current_alignment (unsigned HOST_WIDE_INT base_align,
                           unsigned HOST_WIDE_INT offset)
{
  if (offset == 0)
    return base_align;
  return MIN (base_align, offset & -offset);
}

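/* Editorial note: offset & -offset isolates the lowest set bit of
   OFFSET, i.e. the largest power of two that divides it.  For example,
   base_align 8 with offset 12 gives MIN (8, 4) = 4, since pointer+12
   is only guaranteed 4-byte alignment.  */
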
/* Expand a block compare operation, and return true if successful.
   Return false if we should let the compiler generate normal code,
   probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_block_compare (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];
  HOST_WIDE_INT cmp_bytes = 0;
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  /* If this is not a fixed size compare, just call memcmp.  */
  if (!CONST_INT_P (bytes_rtx))
    return false;

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;

  /* SLOW_UNALIGNED_ACCESS -- don't do unaligned stuff.  */
  if (SLOW_UNALIGNED_ACCESS (word_mode, MEM_ALIGN (orig_src1))
      || SLOW_UNALIGNED_ACCESS (word_mode, MEM_ALIGN (orig_src2)))
    return false;

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to move?  */
  unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
  if (bytes == 0)
    return true;

  /* The code generated for P7 and older is not faster than glibc
     memcmp if alignment is small and length is not short, so bail
     out to avoid those conditions.  */
  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
      && ((base_align == 1 && bytes > 16)
          || (base_align == 2 && bytes > 32)))
    return false;

  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
  /* P7/P8 code uses cond for subfc., but P9 uses
     it for cmpld which needs CCUNSmode.  */
  rtx cond;
  if (TARGET_P9_MISC)
    cond = gen_reg_rtx (CCUNSmode);
  else
    cond = gen_reg_rtx (CCmode);

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
                       && word_mode == DImode);

  /* Strategy phase.  How many ops will this take and should we expand it?  */

  unsigned HOST_WIDE_INT offset = 0;
  machine_mode load_mode =
    select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);

  /* We don't want to generate too much code.  The limit is expressed
     as a number of loads, so the byte cap scales with the size of the
     initial load mode.  */
  unsigned HOST_WIDE_INT max_bytes =
    load_mode_size * (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_limit;
  if (!IN_RANGE (bytes, 1, max_bytes))
    return false;

  bool generate_6432_conversion = false;
  rtx convert_label = NULL;
  rtx final_label = NULL;

  /* Example of generated code for 18 bytes, aligned to 1 byte.
     Compiled with -fno-reorder-blocks for clarity.
	     ldbrx 10,31,8
	     ldbrx 9,7,8
	     subfc. 9,9,10
	     bne 0,.L6487
	     addi 9,12,8
	     addi 5,11,8
	     ldbrx 10,0,9
	     ldbrx 9,0,5
	     subfc. 9,9,10
	     bne 0,.L6487
	     addi 9,12,16
	     lhbrx 10,0,9
	     addi 9,11,16
	     lhbrx 9,0,9
	     subf 9,9,10
	     b .L6488
	     .p2align 4,,15
	     .L6487: #convert_label
	     popcntd 9,9
	     subfe 10,10,10
	     or 9,9,10
	     .L6488: #final_label
	     extsw 10,9

     We start off with DImode for two blocks that jump to the DI->SI conversion
     if the difference is found there, then a final block of HImode that skips
     the DI->SI conversion.  */

  while (bytes > 0)
    {
      unsigned int align = compute_current_alignment (base_align, offset);
      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
        load_mode = select_block_compare_mode (offset, bytes, align,
                                               word_mode_ok);
      else
        load_mode = select_block_compare_mode (0, bytes, align, word_mode_ok);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes >= load_mode_size)
        cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
        {
          /* Move this load back so it doesn't go past the end.
             P8/P9 can do this efficiently.  For example, with an
             8-byte load mode, 5 bytes remaining, and offset 8, we
             back up to offset 5 and compare one full 8-byte chunk
             that ends exactly at the end of the block.  */
          unsigned int extra_bytes = load_mode_size - bytes;
          cmp_bytes = bytes;
          if (extra_bytes < offset)
            {
              offset -= extra_bytes;
              cmp_bytes = load_mode_size;
              bytes = cmp_bytes;
            }
        }
      else
        /* P7 and earlier can't do the overlapping load trick fast,
           so this forces a non-overlapping load and a shift to get
           rid of the extra bytes.  */
        cmp_bytes = bytes;

      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      if (!REG_P (XEXP (src1, 0)))
        {
          rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
          src1 = replace_equiv_address (src1, src1_reg);
        }
      set_mem_size (src1, cmp_bytes);

      if (!REG_P (XEXP (src2, 0)))
        {
          rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
          src2 = replace_equiv_address (src2, src2_reg);
        }
      set_mem_size (src2, cmp_bytes);

      do_load_for_compare (tmp_reg_src1, src1, load_mode);
      do_load_for_compare (tmp_reg_src2, src2, load_mode);

      if (cmp_bytes < load_mode_size)
        {
          /* Shift unneeded bytes off.  */
          rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
          if (word_mode == DImode)
            {
              emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
              emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
            }
          else
            {
              emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
              emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
            }
        }

      int remain = bytes - cmp_bytes;
      if (GET_MODE_SIZE (GET_MODE (target)) > GET_MODE_SIZE (load_mode))
        {
          /* Target is larger than load size so we don't need to
             reduce result size.  */

          /* We previously did a block that needed 64->32 conversion but
             the current block does not, so a label is needed to jump
             to the end.  */
          if (generate_6432_conversion && !final_label)
            final_label = gen_label_rtx ();

          if (remain > 0)
            {
              /* This is not the last block, branch to the end if the result
                 of this subtract is not zero.  */
              if (!final_label)
                final_label = gen_label_rtx ();
              rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
              rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
              rtx cr = gen_reg_rtx (CCmode);
              rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
              emit_insn (gen_movsi (target,
                                    gen_lowpart (SImode, tmp_reg_src2)));
              rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
              rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
                                                 fin_ref, pc_rtx);
              rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
              JUMP_LABEL (j) = final_label;
              LABEL_NUSES (final_label) += 1;
            }
          else
            {
              if (word_mode == DImode)
                {
                  emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
                                         tmp_reg_src2));
                  emit_insn (gen_movsi (target,
                                        gen_lowpart (SImode, tmp_reg_src2)));
                }
              else
                emit_insn (gen_subsi3 (target, tmp_reg_src1, tmp_reg_src2));

              if (final_label)
                {
                  rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
                  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
                  JUMP_LABEL (j) = final_label;
                  LABEL_NUSES (final_label) += 1;
                  emit_barrier ();
                }
            }
        }
      else
        {
          /* Do we need a 64->32 conversion block?  We need the 64->32
             conversion even if target size == load_mode size because
             the subtract generates one extra bit.  */
          generate_6432_conversion = true;

          if (remain > 0)
            {
              if (!convert_label)
                convert_label = gen_label_rtx ();

              /* Compare to zero and branch to convert_label if not zero.  */
              rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
              if (TARGET_P9_MISC)
                {
                  /* Generate a compare, and convert with a setb later.  */
                  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
                                             tmp_reg_src2);
                  emit_insn (gen_rtx_SET (cond, cmp));
                }
              else
                /* Generate a subfc. and use the longer
                   sequence for conversion.  */
                if (TARGET_64BIT)
                  emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
                                                     tmp_reg_src1, cond));
                else
                  emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
                                                     tmp_reg_src1, cond));
              rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
              rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
                                                 cvt_ref, pc_rtx);
              rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
              JUMP_LABEL (j) = convert_label;
              LABEL_NUSES (convert_label) += 1;
            }
          else
            {
              /* Just do the subtract/compare.  Since this is the last block
                 the convert code will be generated immediately following.  */
              if (TARGET_P9_MISC)
                {
                  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
                                             tmp_reg_src2);
                  emit_insn (gen_rtx_SET (cond, cmp));
                }
              else
                if (TARGET_64BIT)
                  emit_insn (gen_subfdi3_carry (tmp_reg_src2, tmp_reg_src2,
                                                tmp_reg_src1));
                else
                  emit_insn (gen_subfsi3_carry (tmp_reg_src2, tmp_reg_src2,
                                                tmp_reg_src1));
            }
        }

      offset += cmp_bytes;
      bytes -= cmp_bytes;
    }

  if (generate_6432_conversion)
    {
      if (convert_label)
        emit_label (convert_label);

      /* We need to produce DI result from sub, then convert to target SI
         while maintaining <0 / ==0 / >0 properties.  This sequence works:
         subfc L,A,B
         subfe H,H,H
         popcntd L,L
         rldimi L,H,6,0

         This is an alternate one Segher cooked up if somebody
         wants to expand this for something that doesn't have popcntd:
         subfc L,a,b
         subfe H,x,x
         addic t,L,-1
         subfe v,t,L
         or z,v,H

         And finally, P9 can just do this:
         cmpld A,B
         setb r  */

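      /* Editorial note (my reading of the sequence above, not from the
         original source): after the subtract, L is zero iff the chunks
         are equal, and the carry bit CA is clear iff the subtraction
         borrowed, i.e. iff the first chunk is unsigned-less than the
         second.  subfe H,H,H turns CA into H = 0 or H = -1, popcntd
         turns any nonzero L into a small positive number, and or-ing
         the two yields a value that is negative, zero, or positive
         exactly as memcmp requires.  */
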
      if (TARGET_P9_MISC)
        {
          emit_insn (gen_setb_unsigned (target, cond));
        }
      else
        {
          if (TARGET_64BIT)
            {
              rtx tmp_reg_ca = gen_reg_rtx (DImode);
              emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
              emit_insn (gen_popcntddi2 (tmp_reg_src2, tmp_reg_src2));
              emit_insn (gen_iordi3 (tmp_reg_src2, tmp_reg_src2, tmp_reg_ca));
              emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2)));
            }
          else
            {
              rtx tmp_reg_ca = gen_reg_rtx (SImode);
              emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
              emit_insn (gen_popcntdsi2 (tmp_reg_src2, tmp_reg_src2));
              emit_insn (gen_iorsi3 (target, tmp_reg_src2, tmp_reg_ca));
            }
        }
    }

  if (final_label)
    emit_label (final_label);

  gcc_assert (bytes == 0);
  return true;
}

/* Generate alignment check and branch code to set up for
   strncmp when we don't have DI alignment.
   STRNCMP_LABEL is the label to branch to if there is a page crossing.
   SRC is the string pointer to be examined.
   BYTES is the max number of bytes to compare.  */
static void
expand_strncmp_align_check (rtx strncmp_label, rtx src, HOST_WIDE_INT bytes)
{
  rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label);
  rtx src_check = copy_addr_to_reg (XEXP (src, 0));
  if (GET_MODE (src_check) == SImode)
    emit_insn (gen_andsi3 (src_check, src_check, GEN_INT (0xfff)));
  else
    emit_insn (gen_anddi3 (src_check, src_check, GEN_INT (0xfff)));
  rtx cond = gen_reg_rtx (CCmode);
  emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_check,
                                         GEN_INT (4096 - bytes)));

  rtx cmp_rtx = gen_rtx_LT (VOIDmode, cond, const0_rtx);

  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
                                     pc_rtx, lab_ref);
  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
  JUMP_LABEL (j) = strncmp_label;
  LABEL_NUSES (strncmp_label) += 1;
}

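/* Editorial example (not from the original source): the check above
   computes the offset of SRC within its 4 KiB page and branches to the
   library-call fallback when that offset is at least 4096 - BYTES,
   i.e. when a BYTES-wide read starting at SRC might cross into the
   next page.  For instance with BYTES = 8, an address ending in 0xff9
   gives 0xff9 >= 0xff8, so the inline compare is skipped for safety.  */
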
/* Expand a string compare operation with length, and return
   true if successful.  Return false if we should let the
   compiler generate normal code, probably a strncmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   If NO_LENGTH is zero, then:
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment in bytes.
   If NO_LENGTH is nonzero, then:
   OPERANDS[3] is the alignment in bytes.  */
bool
expand_strn_compare (rtx operands[], int no_length)
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx, align_rtx;
  if (no_length)
    {
      bytes_rtx = NULL;
      align_rtx = operands[3];
    }
  else
    {
      bytes_rtx = operands[3];
      align_rtx = operands[4];
    }
  unsigned HOST_WIDE_INT cmp_bytes = 0;
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;

  /* If we have a length, it must be constant.  This simplifies things
     a bit as we don't have to generate code to check if we've exceeded
     the length.  Later this could be expanded to handle this case.  */
  if (!no_length && !CONST_INT_P (bytes_rtx))
    return false;

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx);
  int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;

  /* SLOW_UNALIGNED_ACCESS -- don't do unaligned stuff.  */
  if (SLOW_UNALIGNED_ACCESS (word_mode, align1)
      || SLOW_UNALIGNED_ACCESS (word_mode, align2))
    return false;

  gcc_assert (GET_MODE (target) == SImode);

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
                       && word_mode == DImode);

  unsigned int word_mode_size = GET_MODE_SIZE (word_mode);

  unsigned HOST_WIDE_INT offset = 0;
  unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available.  */
  unsigned HOST_WIDE_INT compare_length; /* How much to compare inline.  */
  if (no_length)
    /* Use this as a stand-in to determine the mode to use.  */
    bytes = rs6000_string_compare_inline_limit * word_mode_size;
  else
    bytes = UINTVAL (bytes_rtx);

  machine_mode load_mode =
    select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
  compare_length = rs6000_string_compare_inline_limit * load_mode_size;

  /* If we have equality at the end of the last compare and we have not
     found the end of the string, we need to call strcmp/strncmp to
     compare the remainder.  */
  bool equality_compare_rest = false;

  if (no_length)
    {
      bytes = compare_length;
      equality_compare_rest = true;
    }
  else
    {
      if (bytes <= compare_length)
        compare_length = bytes;
      else
        equality_compare_rest = true;
    }

  rtx result_reg = gen_reg_rtx (word_mode);
  rtx final_move_label = gen_label_rtx ();
  rtx final_label = gen_label_rtx ();
  rtx begin_compare_label = NULL;

  if (base_align < 8)
    {
      /* Generate code that checks distance to 4k boundary for this case.  */
      begin_compare_label = gen_label_rtx ();
      rtx strncmp_label = gen_label_rtx ();
      rtx jmp;

      /* Strncmp for power8 in glibc does this:
         rldicl r8,r3,0,52
         cmpldi cr7,r8,4096-16
         bgt cr7,L(pagecross)  */

      /* Make sure that the length we use for the alignment test and
         the subsequent code generation are in agreement so we do not
         go past the length we tested for a 4k boundary crossing.  */
      unsigned HOST_WIDE_INT align_test = compare_length;
      if (align_test < 8)
        {
          align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
          base_align = align_test;
        }
      else
        {
          align_test = ROUND_UP (align_test, 8);
          base_align = 8;
        }

      if (align1 < 8)
        expand_strncmp_align_check (strncmp_label, src1, align_test);
      if (align2 < 8)
        expand_strncmp_align_check (strncmp_label, src2, align_test);

      /* Now generate the following sequence:
         - branch to begin_compare
         - strncmp_label
         - call to strncmp
         - branch to final_label
         - begin_compare_label */

      rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
      JUMP_LABEL (jmp) = begin_compare_label;
      LABEL_NUSES (begin_compare_label) += 1;
      emit_barrier ();

      emit_label (strncmp_label);

      if (!REG_P (XEXP (src1, 0)))
        {
          rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
          src1 = replace_equiv_address (src1, src1_reg);
        }

      if (!REG_P (XEXP (src2, 0)))
        {
          rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
          src2 = replace_equiv_address (src2, src2_reg);
        }

      if (no_length)
        {
          tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
          emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                                   target, LCT_NORMAL, GET_MODE (target), 2,
                                   force_reg (Pmode, XEXP (src1, 0)), Pmode,
                                   force_reg (Pmode, XEXP (src2, 0)), Pmode);
        }
      else
        {
          /* -m32 -mpowerpc64 results in word_mode being DImode even
             though the target is otherwise 32-bit.  The length arg to
             strncmp is a size_t which will be the same size as pointers.  */
          rtx len_rtx;
          if (TARGET_64BIT)
            len_rtx = gen_reg_rtx (DImode);
          else
            len_rtx = gen_reg_rtx (SImode);

          emit_move_insn (len_rtx, bytes_rtx);

          tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
          emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                                   target, LCT_NORMAL, GET_MODE (target), 3,
                                   force_reg (Pmode, XEXP (src1, 0)), Pmode,
                                   force_reg (Pmode, XEXP (src2, 0)), Pmode,
                                   len_rtx, GET_MODE (len_rtx));
        }

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
      emit_label (begin_compare_label);
    }

  rtx cleanup_label = NULL;
  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);

  /* Generate a sequence of ld/ldbrx and cmpb to compare out
     to the length specified.  */
  unsigned HOST_WIDE_INT bytes_to_compare = compare_length;
  while (bytes_to_compare > 0)
    {
      /* Compare sequence:
         check each 8B with: ld/ld cmpd bne
         If equal, use rldicr/cmpb to check for zero byte.
         cleanup code at end:
         cmpb          get byte that differs
         cmpb          look for zero byte
         orc           combine
         cntlzd        get bit of first zero/diff byte
         subfic        convert for rldcl use
         rldcl rldcl   extract diff/zero byte
         subf          subtract for final result

         The last compare can branch around the cleanup code if the
         result is zero because the strings are exactly equal.  */
      unsigned int align = compute_current_alignment (base_align, offset);
      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
        load_mode = select_block_compare_mode (offset, bytes_to_compare, align,
                                               word_mode_ok);
      else
        load_mode = select_block_compare_mode (0, bytes_to_compare, align,
                                               word_mode_ok);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes_to_compare >= load_mode_size)
        cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
        {
          /* Move this load back so it doesn't go past the end.
             P8/P9 can do this efficiently.  */
          unsigned int extra_bytes = load_mode_size - bytes_to_compare;
          cmp_bytes = bytes_to_compare;
          if (extra_bytes < offset)
            {
              offset -= extra_bytes;
              cmp_bytes = load_mode_size;
              bytes_to_compare = cmp_bytes;
            }
        }
      else
        /* P7 and earlier can't do the overlapping load trick fast,
           so this forces a non-overlapping load and a shift to get
           rid of the extra bytes.  */
        cmp_bytes = bytes_to_compare;

      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      if (!REG_P (XEXP (src1, 0)))
        {
          rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
          src1 = replace_equiv_address (src1, src1_reg);
        }
      set_mem_size (src1, cmp_bytes);

      if (!REG_P (XEXP (src2, 0)))
        {
          rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
          src2 = replace_equiv_address (src2, src2_reg);
        }
      set_mem_size (src2, cmp_bytes);

      do_load_for_compare (tmp_reg_src1, src1, load_mode);
      do_load_for_compare (tmp_reg_src2, src2, load_mode);

      /* We must always left-align the data we read, and
         clear any bytes to the right that are beyond the string.
         Otherwise the cmpb sequence won't produce the correct
         results.  The beginning of the compare will be done
         with word_mode so it will not have any extra shifts or
         clear-right operations.  */

      if (load_mode_size < word_mode_size)
        {
          /* Rotate left first.  */
          rtx sh = GEN_INT (BITS_PER_UNIT * (word_mode_size - load_mode_size));
          if (word_mode == DImode)
            {
              emit_insn (gen_rotldi3 (tmp_reg_src1, tmp_reg_src1, sh));
              emit_insn (gen_rotldi3 (tmp_reg_src2, tmp_reg_src2, sh));
            }
          else
            {
              emit_insn (gen_rotlsi3 (tmp_reg_src1, tmp_reg_src1, sh));
              emit_insn (gen_rotlsi3 (tmp_reg_src2, tmp_reg_src2, sh));
            }
        }

      if (cmp_bytes < word_mode_size)
        {
          /* Now clear right.  This plus the rotate can be
             turned into a rldicr instruction.  */
          HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
          rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
          if (word_mode == DImode)
            {
              emit_insn (gen_anddi3_mask (tmp_reg_src1, tmp_reg_src1, mask));
              emit_insn (gen_anddi3_mask (tmp_reg_src2, tmp_reg_src2, mask));
            }
          else
            {
              emit_insn (gen_andsi3_mask (tmp_reg_src1, tmp_reg_src1, mask));
              emit_insn (gen_andsi3_mask (tmp_reg_src2, tmp_reg_src2, mask));
            }
        }

      /* Cases to handle.  A and B are chunks of the two strings.
         1: Not end of comparison:
         A != B: branch to cleanup code to compute result.
         A == B: check for 0 byte, next block if not found.
         2: End of the inline comparison:
         A != B: branch to cleanup code to compute result.
         A == B: check for 0 byte, call strcmp/strncmp.
         3: Compared the requested N bytes:
         A == B: branch to result 0.
         A != B: cleanup code to compute result.  */

      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;

      rtx dst_label;
      if (remain > 0 || equality_compare_rest)
        {
          /* Branch to cleanup code, otherwise fall through to do
             more compares.  */
          if (!cleanup_label)
            cleanup_label = gen_label_rtx ();
          dst_label = cleanup_label;
        }
      else
        /* Branch to end and produce result of 0.  */
        dst_label = final_move_label;

      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
      rtx cond = gen_reg_rtx (CCmode);

      /* Always produce the 0 result, it is needed if
         cmpb finds a 0 byte in this chunk.  */
      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
      rs6000_emit_dot_insn (result_reg, tmp, 1, cond);

      rtx cmp_rtx;
      if (remain == 0 && !equality_compare_rest)
        cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
      else
        cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
                                         lab_ref, pc_rtx);
      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
      JUMP_LABEL (j) = dst_label;
      LABEL_NUSES (dst_label) += 1;

      if (remain > 0 || equality_compare_rest)
        {
          /* Generate a cmpb to test for a 0 byte and branch
             to final result if found.  */
          rtx cmpb_zero = gen_reg_rtx (word_mode);
          rtx lab_ref_fin = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
          rtx condz = gen_reg_rtx (CCmode);
          rtx zero_reg = gen_reg_rtx (word_mode);
          if (word_mode == SImode)
            {
              emit_insn (gen_movsi (zero_reg, GEN_INT (0)));
              emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg));
              if (cmp_bytes < word_mode_size)
                {
                  /* Don't want to look at zero bytes past the end.  */
                  HOST_WIDE_INT mb =
                    BITS_PER_UNIT * (word_mode_size - cmp_bytes);
                  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
                  emit_insn (gen_andsi3_mask (cmpb_zero, cmpb_zero, mask));
                }
            }
          else
            {
              emit_insn (gen_movdi (zero_reg, GEN_INT (0)));
              emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg));
              if (cmp_bytes < word_mode_size)
                {
                  /* Don't want to look at zero bytes past the end.  */
                  HOST_WIDE_INT mb =
                    BITS_PER_UNIT * (word_mode_size - cmp_bytes);
                  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
                  emit_insn (gen_anddi3_mask (cmpb_zero, cmpb_zero, mask));
                }
            }

          emit_move_insn (condz, gen_rtx_COMPARE (CCmode, cmpb_zero, zero_reg));
          rtx cmpnz_rtx = gen_rtx_NE (VOIDmode, condz, const0_rtx);
          rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmpnz_rtx,
                                             lab_ref_fin, pc_rtx);
          rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
          JUMP_LABEL (j2) = final_move_label;
          LABEL_NUSES (final_move_label) += 1;
        }

      offset += cmp_bytes;
      bytes_to_compare -= cmp_bytes;
    }

  if (equality_compare_rest)
    {
      /* Update pointers past what has been compared already.  */
      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      if (!REG_P (XEXP (src1, 0)))
        {
          rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
          src1 = replace_equiv_address (src1, src1_reg);
        }
      set_mem_size (src1, cmp_bytes);

      if (!REG_P (XEXP (src2, 0)))
        {
          rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
          src2 = replace_equiv_address (src2, src2_reg);
        }
      set_mem_size (src2, cmp_bytes);

      /* Construct call to strcmp/strncmp to compare the rest of the string.  */
      if (no_length)
        {
          tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
          emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                                   target, LCT_NORMAL, GET_MODE (target), 2,
                                   force_reg (Pmode, XEXP (src1, 0)), Pmode,
                                   force_reg (Pmode, XEXP (src2, 0)), Pmode);
        }
      else
        {
          rtx len_rtx;
          if (TARGET_64BIT)
            len_rtx = gen_reg_rtx (DImode);
          else
            len_rtx = gen_reg_rtx (SImode);

          emit_move_insn (len_rtx, GEN_INT (bytes - compare_length));
          tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
          emit_library_call_value (XEXP (DECL_RTL (fun), 0),
                                   target, LCT_NORMAL, GET_MODE (target), 3,
                                   force_reg (Pmode, XEXP (src1, 0)), Pmode,
                                   force_reg (Pmode, XEXP (src2, 0)), Pmode,
                                   len_rtx, GET_MODE (len_rtx));
        }

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }

  if (cleanup_label)
    emit_label (cleanup_label);

  /* Generate the final sequence that identifies the differing
     byte and generates the final result, taking into account
     zero bytes:

     cmpb cmpb_result1, src1, src2
     cmpb cmpb_result2, src1, zero
     orc cmpb_result1, cmpb_result1, cmpb_result2
     cntlzd get bit of first zero/diff byte
     addi convert for rldcl use
     rldcl rldcl extract diff/zero byte
     subf subtract for final result  */

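  /* Editorial walk-through (my reading, not from the original source):
     cmpb sets each result byte to 0xff where the corresponding source
     bytes are equal, so ~cmpb_diff has 0xff exactly at differing bytes
     and cmpb_zero has 0xff where src1 has a zero byte.  Their OR marks
     every byte that can decide the result; cntlzd of that mask is 8
     times the index of the first such byte, and adding 8 makes it a
     rotate-left count that brings the deciding byte into the low byte
     of each operand.  Masking with 0xff and subtracting then gives the
     usual negative/zero/positive strcmp result.  */
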
  rtx cmpb_diff = gen_reg_rtx (word_mode);
  rtx cmpb_zero = gen_reg_rtx (word_mode);
  rtx rot_amt = gen_reg_rtx (word_mode);
  rtx zero_reg = gen_reg_rtx (word_mode);

  rtx rot1_1 = gen_reg_rtx (word_mode);
  rtx rot1_2 = gen_reg_rtx (word_mode);
  rtx rot2_1 = gen_reg_rtx (word_mode);
  rtx rot2_2 = gen_reg_rtx (word_mode);

  if (word_mode == SImode)
    {
      emit_insn (gen_cmpbsi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2));
      emit_insn (gen_movsi (zero_reg, GEN_INT (0)));
      emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg));
      emit_insn (gen_one_cmplsi2 (cmpb_diff, cmpb_diff));
      emit_insn (gen_iorsi3 (cmpb_diff, cmpb_diff, cmpb_zero));
      emit_insn (gen_clzsi2 (rot_amt, cmpb_diff));
      emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotlsi3 (rot1_1, tmp_reg_src1,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotlsi3 (rot2_1, tmp_reg_src2,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subsi3 (result_reg, rot1_2, rot2_2));
    }
  else
    {
      emit_insn (gen_cmpbdi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2));
      emit_insn (gen_movdi (zero_reg, GEN_INT (0)));
      emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg));
      emit_insn (gen_one_cmpldi2 (cmpb_diff, cmpb_diff));
      emit_insn (gen_iordi3 (cmpb_diff, cmpb_diff, cmpb_zero));
      emit_insn (gen_clzdi2 (rot_amt, cmpb_diff));
      emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotldi3 (rot1_1, tmp_reg_src1,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotldi3 (rot2_1, tmp_reg_src2,
                              gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subdi3 (result_reg, rot1_2, rot2_2));
    }

  emit_label (final_move_label);
  emit_insn (gen_movsi (target,
                        gen_lowpart (SImode, result_reg)));
  emit_label (final_label);
  return true;
}

/* Expand a block move operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the source
   operands[2] is the length
   operands[3] is the alignment */

#define MAX_MOVE_REG 4

int
expand_block_move (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx orig_src = operands[1];
  rtx bytes_rtx = operands[2];
  rtx align_rtx = operands[3];
  int constp = (GET_CODE (bytes_rtx) == CONST_INT);
  int align;
  int bytes;
  int offset;
  int move_bytes;
  rtx stores[MAX_MOVE_REG];
  int num_reg = 0;

  /* If this is not a fixed size move, just call memcpy.  */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment.  */
  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to move?  */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  if (bytes > rs6000_block_move_inline_limit)
    return 0;

  for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
    {
      union {
        rtx (*movmemsi) (rtx, rtx, rtx, rtx);
        rtx (*mov) (rtx, rtx);
      } gen_func;
      machine_mode mode = BLKmode;
      rtx src, dest;

      /* Altivec first, since it will be faster than a string move
         when it applies, and usually not significantly larger.  */
      if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
        {
          move_bytes = 16;
          mode = V4SImode;
          gen_func.mov = gen_movv4si;
        }
      else if (TARGET_STRING
               && bytes > 24    /* move up to 32 bytes at a time */
               && ! fixed_regs[5]
               && ! fixed_regs[6]
               && ! fixed_regs[7]
               && ! fixed_regs[8]
               && ! fixed_regs[9]
               && ! fixed_regs[10]
               && ! fixed_regs[11]
               && ! fixed_regs[12])
        {
          move_bytes = (bytes > 32) ? 32 : bytes;
          gen_func.movmemsi = gen_movmemsi_8reg;
        }
      else if (TARGET_STRING
               && bytes > 16    /* move up to 24 bytes at a time */
               && ! fixed_regs[5]
               && ! fixed_regs[6]
               && ! fixed_regs[7]
               && ! fixed_regs[8]
               && ! fixed_regs[9]
               && ! fixed_regs[10])
        {
          move_bytes = (bytes > 24) ? 24 : bytes;
          gen_func.movmemsi = gen_movmemsi_6reg;
        }
      else if (TARGET_STRING
               && bytes > 8     /* move up to 16 bytes at a time */
               && ! fixed_regs[5]
               && ! fixed_regs[6]
               && ! fixed_regs[7]
               && ! fixed_regs[8])
        {
          move_bytes = (bytes > 16) ? 16 : bytes;
          gen_func.movmemsi = gen_movmemsi_4reg;
        }
      else if (bytes >= 8 && TARGET_POWERPC64
               && (align >= 64 || !STRICT_ALIGNMENT))
        {
          move_bytes = 8;
          mode = DImode;
          gen_func.mov = gen_movdi;
          if (offset == 0 && align < 64)
            {
              rtx addr;

              /* If the address form is reg+offset with offset not a
                 multiple of four, reload into reg indirect form here
                 rather than waiting for reload.  This way we get one
                 reload, not one per load and/or store.  */
              addr = XEXP (orig_dest, 0);
              if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
                  && GET_CODE (XEXP (addr, 1)) == CONST_INT
                  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
                {
                  addr = copy_addr_to_reg (addr);
                  orig_dest = replace_equiv_address (orig_dest, addr);
                }
              addr = XEXP (orig_src, 0);
              if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
                  && GET_CODE (XEXP (addr, 1)) == CONST_INT
                  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
                {
                  addr = copy_addr_to_reg (addr);
                  orig_src = replace_equiv_address (orig_src, addr);
                }
            }
        }
      else if (TARGET_STRING && bytes > 4 && !TARGET_POWERPC64)
        {                       /* move up to 8 bytes at a time */
          move_bytes = (bytes > 8) ? 8 : bytes;
          gen_func.movmemsi = gen_movmemsi_2reg;
        }
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
        {                       /* move 4 bytes */
          move_bytes = 4;
          mode = SImode;
          gen_func.mov = gen_movsi;
        }
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
        {                       /* move 2 bytes */
          move_bytes = 2;
          mode = HImode;
          gen_func.mov = gen_movhi;
        }
      else if (TARGET_STRING && bytes > 1)
        {                       /* move up to 4 bytes at a time */
          move_bytes = (bytes > 4) ? 4 : bytes;
          gen_func.movmemsi = gen_movmemsi_1reg;
        }
      else /* move 1 byte at a time */
        {
          move_bytes = 1;
          mode = QImode;
          gen_func.mov = gen_movqi;
        }

      src = adjust_address (orig_src, mode, offset);
      dest = adjust_address (orig_dest, mode, offset);

      if (mode != BLKmode)
        {
          rtx tmp_reg = gen_reg_rtx (mode);

          emit_insn ((*gen_func.mov) (tmp_reg, src));
          stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
        }

      if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
        {
          int i;
          for (i = 0; i < num_reg; i++)
            emit_insn (stores[i]);
          num_reg = 0;
        }

      if (mode == BLKmode)
        {
          /* Move the address into scratch registers.  The movmemsi
             patterns require zero offset.  */
          if (!REG_P (XEXP (src, 0)))
            {
              rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
              src = replace_equiv_address (src, src_reg);
            }
          set_mem_size (src, move_bytes);

          if (!REG_P (XEXP (dest, 0)))
            {
              rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
              dest = replace_equiv_address (dest, dest_reg);
            }
          set_mem_size (dest, move_bytes);

          emit_insn ((*gen_func.movmemsi) (dest, src,
                                           GEN_INT (move_bytes & 31),
                                           align_rtx));
        }
    }

  return 1;
}

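/* Editorial example (not from the original source): without
   TARGET_STRING, a 13-byte copy with 4-byte alignment on a 32-bit
   target decomposes as SImode, SImode, SImode, QImode.  The loads are
   emitted immediately but the stores are queued in STORES[] and
   flushed in batches of up to MAX_MOVE_REG, which gives the scheduler
   independent load/store pairs to interleave.  */
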
\f
/* Return a string to perform a load_multiple operation.
   operands[0] is the vector.
   operands[1] is the source address.
   operands[2] is the first destination register.  */

const char *
rs6000_output_load_multiple (rtx operands[3])
{
  /* We have to handle the case where the pseudo used to contain the address
     is assigned to one of the output registers.  */
  int i, j;
  int words = XVECLEN (operands[0], 0);
  rtx xop[10];

  if (XVECLEN (operands[0], 0) == 1)
    return "lwz %2,0(%1)";

  for (i = 0; i < words; i++)
    if (refers_to_regno_p (REGNO (operands[2]) + i, operands[1]))
      {
        if (i == words-1)
          {
            xop[0] = GEN_INT (4 * (words-1));
            xop[1] = operands[1];
            xop[2] = operands[2];
            output_asm_insn ("lswi %2,%1,%0\n\tlwz %1,%0(%1)", xop);
            return "";
          }
        else if (i == 0)
          {
            xop[0] = GEN_INT (4 * (words-1));
            xop[1] = operands[1];
            xop[2] = gen_rtx_REG (SImode, REGNO (operands[2]) + 1);
            output_asm_insn ("addi %1,%1,4\n\tlswi %2,%1,%0\n\tlwz %1,-4(%1)", xop);
            return "";
          }
        else
          {
            for (j = 0; j < words; j++)
              if (j != i)
                {
                  xop[0] = GEN_INT (j * 4);
                  xop[1] = operands[1];
                  xop[2] = gen_rtx_REG (SImode, REGNO (operands[2]) + j);
                  output_asm_insn ("lwz %2,%0(%1)", xop);
                }
            xop[0] = GEN_INT (i * 4);
            xop[1] = operands[1];
            output_asm_insn ("lwz %1,%0(%1)", xop);
            return "";
          }
      }

  return "lswi %2,%1,%N0";
}

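/* Editorial example (not from the original source): the special cases
   above guard against the address register appearing in the output
   vector.  For a 3-word load where operands[1] is the same register as
   the last destination, the "i == words-1" arm loads the first two
   words with lswi and only then overwrites the address register with
   the final lwz, so the address is not clobbered before its last
   use.  */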