]>
Commit | Line | Data |
---|---|---|
8845cb37 AS |
1 | /* Subroutines used to expand string and block move, clear, |
2 | compare and other operations for PowerPC. | |
3 | Copyright (C) 1991-2017 Free Software Foundation, Inc. | |
4 | ||
5 | This file is part of GCC. | |
6 | ||
7 | GCC is free software; you can redistribute it and/or modify it | |
8 | under the terms of the GNU General Public License as published | |
9 | by the Free Software Foundation; either version 3, or (at your | |
10 | option) any later version. | |
11 | ||
12 | GCC is distributed in the hope that it will be useful, but WITHOUT | |
13 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
14 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public | |
15 | License for more details. | |
16 | ||
17 | You should have received a copy of the GNU General Public License | |
18 | along with GCC; see the file COPYING3. If not see | |
19 | <http://www.gnu.org/licenses/>. */ | |
20 | ||
21 | #include "config.h" | |
22 | #include "system.h" | |
23 | #include "coretypes.h" | |
24 | #include "backend.h" | |
25 | #include "rtl.h" | |
26 | #include "tree.h" | |
27 | #include "memmodel.h" | |
28 | #include "tm_p.h" | |
29 | #include "ira.h" | |
30 | #include "print-tree.h" | |
31 | #include "varasm.h" | |
32 | #include "explow.h" | |
33 | #include "expr.h" | |
34 | #include "output.h" | |
35 | ||
36 | /* Expand a block clear operation, and return 1 if successful. Return 0 | |
37 | if we should let the compiler generate normal code. The clear is emitted as a straight-line sequence of zero stores, each using the widest mode that the remaining length and the alignment allow. | |
38 | ||
39 | operands[0] is the destination | |
40 | operands[1] is the length; only a CONST_INT length is expanded inline | |
41 | operands[3] is the alignment; it must be a CONST_INT and is scaled by BITS_PER_UNIT below */ | |
42 | ||
43 | int | |
44 | expand_block_clear (rtx operands[]) | |
45 | { | |
46 | rtx orig_dest = operands[0]; | |
47 | rtx bytes_rtx = operands[1]; | |
48 | rtx align_rtx = operands[3]; | |
49 | bool constp = (GET_CODE (bytes_rtx) == CONST_INT); | |
50 | HOST_WIDE_INT align; /* Alignment after scaling by BITS_PER_UNIT. */ | |
51 | HOST_WIDE_INT bytes; /* Bytes still to be cleared. */ | |
52 | int offset; /* Byte offset of the next store from the start of the block. */ | |
53 | int clear_bytes; /* Bytes cleared by the current store. */ | |
54 | int clear_step; /* Widest store size usable; scales the size limits below. */ | |
55 | ||
56 | /* If this is not a fixed size clear, return 0 so that normal code is generated. */ | |
57 | if (! constp) | |
58 | return 0; | |
59 | ||
60 | /* This must be a fixed size alignment */ | |
61 | gcc_assert (GET_CODE (align_rtx) == CONST_INT); | |
62 | align = INTVAL (align_rtx) * BITS_PER_UNIT; | |
63 | ||
64 | /* Anything to clear? A length that is zero or negative needs no code, so report success. */ | |
65 | bytes = INTVAL (bytes_rtx); | |
66 | if (bytes <= 0) | |
67 | return 1; | |
68 | ||
69 | /* Use the builtin memset after a point, to avoid huge code bloat. | |
70 | When optimize_size, avoid any significant code bloat; calling | |
71 | memset is about 4 instructions, so allow for one instruction to | |
72 | load zero and three to do clearing. The limits applied below are 3 * clear_step bytes when optimize_size is set and 8 * clear_step bytes otherwise. */ | |
73 | if (TARGET_ALTIVEC && align >= 128) | |
74 | clear_step = 16; | |
75 | else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT)) | |
76 | clear_step = 8; | |
77 | else | |
78 | clear_step = 4; | |
79 | ||
80 | if (optimize_size && bytes > 3 * clear_step) | |
81 | return 0; | |
82 | if (! optimize_size && bytes > 8 * clear_step) | |
83 | return 0; | |
84 | ||
85 | for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes) /* One zero store is emitted per iteration. */ | |
86 | { | |
87 | machine_mode mode = BLKmode; | |
88 | rtx dest; | |
89 | ||
90 | if (bytes >= 16 && TARGET_ALTIVEC && align >= 128) | |
91 | { | |
92 | clear_bytes = 16; | |
93 | mode = V4SImode; | |
94 | } | |
95 | else if (bytes >= 8 && TARGET_POWERPC64 | |
96 | && (align >= 64 || !STRICT_ALIGNMENT)) | |
97 | { | |
98 | clear_bytes = 8; | |
99 | mode = DImode; | |
100 | if (offset == 0 && align < 64) /* Checked on the first store only. */ | |
101 | { | |
102 | rtx addr; | |
103 | ||
104 | /* If the address form is reg+offset with offset not a | |
105 | multiple of four, reload into reg indirect form here | |
106 | rather than waiting for reload. This way we get one | |
107 | reload, not one per store. */ | |
108 | addr = XEXP (orig_dest, 0); | |
109 | if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM) | |
110 | && GET_CODE (XEXP (addr, 1)) == CONST_INT | |
111 | && (INTVAL (XEXP (addr, 1)) & 3) != 0) | |
112 | { | |
113 | addr = copy_addr_to_reg (addr); | |
114 | orig_dest = replace_equiv_address (orig_dest, addr); | |
115 | } | |
116 | } | |
117 | } | |
118 | else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT)) | |
119 | { /* clear 4 bytes */ | |
120 | clear_bytes = 4; | |
121 | mode = SImode; | |
122 | } | |
123 | else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT)) | |
124 | { /* clear 2 bytes */ | |
125 | clear_bytes = 2; | |
126 | mode = HImode; | |
127 | } | |
128 | else /* clear 1 byte at a time */ | |
129 | { | |
130 | clear_bytes = 1; | |
131 | mode = QImode; | |
132 | } | |
133 | ||
134 | dest = adjust_address (orig_dest, mode, offset); | |
135 | ||
136 | emit_move_insn (dest, CONST0_RTX (mode)); /* Store a zero of the chosen mode at DEST. */ | |
137 | } | |
138 | ||
139 | return 1; | |
140 | } | |
141 | ||
142 | /* Figure out the correct instructions to generate to load data for | |
143 | block compare. MODE is used for the read from memory, and | |
144 | data is zero extended if REG is wider than MODE. If LE code | |
145 | is being generated, bswap loads are used for every MODE except QImode. | |
146 | ||
147 | REG is the destination register to move the data into; its mode must be DImode or SImode. | |
148 | MEM is the memory block being read. | |
149 | MODE is the mode of memory to use for the read; it must be QImode, HImode, SImode or DImode and no wider than REG. Any other combination reaches gcc_unreachable. */ | |
150 | static void | |
151 | do_load_for_compare (rtx reg, rtx mem, machine_mode mode) | |
152 | { | |
153 | switch (GET_MODE (reg)) | |
154 | { | |
155 | case DImode: | |
156 | switch (mode) | |
157 | { | |
158 | case QImode: | |
159 | emit_insn (gen_zero_extendqidi2 (reg, mem)); | |
160 | break; | |
161 | case HImode: | |
162 | { | |
163 | rtx src = mem; | |
164 | if (!BYTES_BIG_ENDIAN) | |
165 | { | |
166 | src = gen_reg_rtx (HImode); /* Byte-swap into a fresh HImode reg, then extend from it. */ | |
167 | emit_insn (gen_bswaphi2 (src, mem)); | |
168 | } | |
169 | emit_insn (gen_zero_extendhidi2 (reg, src)); | |
170 | break; | |
171 | } | |
172 | case SImode: | |
173 | { | |
174 | rtx src = mem; | |
175 | if (!BYTES_BIG_ENDIAN) | |
176 | { | |
177 | src = gen_reg_rtx (SImode); /* Byte-swap into a fresh SImode reg, then extend from it. */ | |
178 | emit_insn (gen_bswapsi2 (src, mem)); | |
179 | } | |
180 | emit_insn (gen_zero_extendsidi2 (reg, src)); | |
181 | } | |
182 | break; | |
183 | case DImode: | |
184 | if (!BYTES_BIG_ENDIAN) | |
185 | emit_insn (gen_bswapdi2 (reg, mem)); | |
186 | else | |
187 | emit_insn (gen_movdi (reg, mem)); | |
188 | break; | |
189 | default: | |
190 | gcc_unreachable (); | |
191 | } | |
192 | break; | |
193 | ||
194 | case SImode: | |
195 | switch (mode) | |
196 | { | |
197 | case QImode: | |
198 | emit_insn (gen_zero_extendqisi2 (reg, mem)); | |
199 | break; | |
200 | case HImode: | |
201 | { | |
202 | rtx src = mem; | |
203 | if (!BYTES_BIG_ENDIAN) | |
204 | { | |
205 | src = gen_reg_rtx (HImode); /* Byte-swap into a fresh HImode reg, then extend from it. */ | |
206 | emit_insn (gen_bswaphi2 (src, mem)); | |
207 | } | |
208 | emit_insn (gen_zero_extendhisi2 (reg, src)); | |
209 | break; | |
210 | } | |
211 | case SImode: | |
212 | if (!BYTES_BIG_ENDIAN) | |
213 | emit_insn (gen_bswapsi2 (reg, mem)); | |
214 | else | |
215 | emit_insn (gen_movsi (reg, mem)); | |
216 | break; | |
217 | case DImode: | |
218 | /* DImode is larger than the destination reg so is not expected. */ | |
219 | gcc_unreachable (); | |
220 | break; | |
221 | default: | |
222 | gcc_unreachable (); | |
223 | } | |
224 | break; | |
225 | default: | |
226 | gcc_unreachable (); | |
227 | break; | |
228 | } | |
229 | } | |
230 | ||
231 | /* Select the mode to be used for reading the next chunk of bytes | |
232 | in the compare. The result is one of QImode, HImode, SImode or word_mode, and may be wider than BYTES; callers must then shift off or overlap the extra bytes. | |
233 | ||
234 | OFFSET is the current read offset from the beginning of the block. | |
235 | BYTES is the number of bytes remaining to be read. | |
236 | ALIGN is the minimum alignment of the memory blocks being compared in bytes. | |
237 | WORD_MODE_OK indicates using WORD_MODE is allowed, else SImode is | |
238 | the largest allowable mode. */ | |
239 | static machine_mode | |
240 | select_block_compare_mode (unsigned HOST_WIDE_INT offset, | |
241 | unsigned HOST_WIDE_INT bytes, | |
242 | unsigned HOST_WIDE_INT align, bool word_mode_ok) | |
243 | { | |
244 | /* First see if we can do a whole load unit | |
245 | as that will be more efficient than a larger load + shift. */ | |
246 | ||
247 | /* If big, use biggest chunk. | |
248 | If exactly chunk size, use that size. | |
249 | If remainder can be done in one piece with shifting, do that. | |
250 | Do largest chunk possible without violating alignment rules. */ | |
251 | ||
252 | /* The most we can read without potential page crossing: BYTES rounded up to a multiple of ALIGN. */ | |
253 | unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align); | |
254 | ||
255 | if (word_mode_ok && bytes >= UNITS_PER_WORD) | |
256 | return word_mode; | |
257 | else if (bytes == GET_MODE_SIZE (SImode)) | |
258 | return SImode; | |
259 | else if (bytes == GET_MODE_SIZE (HImode)) | |
260 | return HImode; | |
261 | else if (bytes == GET_MODE_SIZE (QImode)) | |
262 | return QImode; | |
263 | else if (bytes < GET_MODE_SIZE (SImode) | |
264 | && offset >= GET_MODE_SIZE (SImode) - bytes) | |
265 | /* This matches the case where we have SImode and 3 bytes | |
266 | and offset >= 1 and permits us to move back one and overlap | |
267 | with the previous read, thus avoiding having to shift | |
268 | unwanted bytes off of the input. */ | |
269 | return SImode; | |
270 | else if (word_mode_ok && bytes < UNITS_PER_WORD | |
271 | && offset >= UNITS_PER_WORD-bytes) | |
272 | /* Similarly, if we can use DImode it will get matched here and | |
273 | can do an overlapping read that ends at the end of the block. */ | |
274 | return word_mode; | |
275 | else if (word_mode_ok && maxread >= UNITS_PER_WORD) | |
276 | /* It is safe to do all remaining in one load of largest size, | |
277 | possibly with a shift to get rid of unwanted bytes. */ | |
278 | return word_mode; | |
279 | else if (maxread >= GET_MODE_SIZE (SImode)) | |
280 | /* It is safe to do all remaining in one SImode load, | |
281 | possibly with a shift to get rid of unwanted bytes. */ | |
282 | return SImode; | |
283 | else if (bytes > GET_MODE_SIZE (SImode)) /* No wider load is allowed here; take an exact SImode piece and leave the rest for later. */ | |
284 | return SImode; | |
285 | else if (bytes > GET_MODE_SIZE (HImode)) /* Likewise, take an exact HImode piece. */ | |
286 | return HImode; | |
287 | ||
288 | /* final fallback is do one byte */ | |
289 | return QImode; | |
290 | } | |
291 | ||
292 | /* Compute the alignment of pointer+OFFSET where the original alignment | |
293 | of pointer was BASE_ALIGN. An OFFSET of zero leaves the alignment at BASE_ALIGN; otherwise the result is the smaller of BASE_ALIGN and the lowest set bit of OFFSET. */ | |
294 | static unsigned HOST_WIDE_INT | |
295 | compute_current_alignment (unsigned HOST_WIDE_INT base_align, | |
296 | unsigned HOST_WIDE_INT offset) | |
297 | { | |
298 | if (offset == 0) | |
299 | return base_align; | |
300 | return MIN (base_align, offset & -offset); /* offset & -offset isolates the lowest set bit, i.e. the largest power of two dividing OFFSET. */ | |
301 | } | |
302 | ||
303 | /* Expand a block compare operation, and return true if successful. | |
304 | Return false if we should let the compiler generate normal code, | |
305 | probably a memcmp call. | |
306 | ||
307 | OPERANDS[0] is the target (result); it must be SImode. | |
308 | OPERANDS[1] is the first source. | |
309 | OPERANDS[2] is the second source. | |
310 | OPERANDS[3] is the length; only a CONST_INT length is expanded. | |
311 | OPERANDS[4] is the alignment; it must be a CONST_INT and is divided by BITS_PER_UNIT below to obtain bytes. */ | |
312 | bool | |
313 | expand_block_compare (rtx operands[]) | |
314 | { | |
315 | rtx target = operands[0]; | |
316 | rtx orig_src1 = operands[1]; | |
317 | rtx orig_src2 = operands[2]; | |
318 | rtx bytes_rtx = operands[3]; | |
319 | rtx align_rtx = operands[4]; | |
320 | HOST_WIDE_INT cmp_bytes = 0; /* Bytes actually compared by the current pair of loads. */ | |
321 | rtx src1 = orig_src1; | |
322 | rtx src2 = orig_src2; | |
323 | ||
324 | /* This case is complicated to handle because the subtract | |
325 | with carry instructions do not generate the 64-bit | |
326 | carry and so we must emit code to calculate it ourselves. | |
327 | We choose not to implement this yet. */ | |
328 | if (TARGET_32BIT && TARGET_POWERPC64) | |
329 | return false; | |
330 | ||
331 | /* If this is not a fixed size compare, just call memcmp. */ | |
332 | if (!CONST_INT_P (bytes_rtx)) | |
333 | return false; | |
334 | ||
335 | /* This must be a fixed size alignment. */ | |
336 | if (!CONST_INT_P (align_rtx)) | |
337 | return false; | |
338 | ||
339 | unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT; | |
340 | ||
341 | /* SLOW_UNALIGNED_ACCESS -- don't do unaligned stuff. */ | |
342 | if (SLOW_UNALIGNED_ACCESS (word_mode, MEM_ALIGN (orig_src1)) | |
343 | || SLOW_UNALIGNED_ACCESS (word_mode, MEM_ALIGN (orig_src2))) | |
344 | return false; | |
345 | ||
346 | gcc_assert (GET_MODE (target) == SImode); | |
347 | ||
348 | /* Anything to compare? NOTE(review): this path returns true without emitting any store to TARGET; confirm that zero-length compares are handled before this expander is reached. */ | |
349 | unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx); | |
350 | if (bytes == 0) | |
351 | return true; | |
352 | ||
353 | /* The code generated for p7 and older is not faster than glibc | |
354 | memcmp if alignment is small and length is not short, so bail | |
355 | out to avoid those conditions. */ | |
356 | if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED | |
357 | && ((base_align == 1 && bytes > 16) | |
358 | || (base_align == 2 && bytes > 32))) | |
359 | return false; | |
360 | ||
361 | rtx tmp_reg_src1 = gen_reg_rtx (word_mode); | |
362 | rtx tmp_reg_src2 = gen_reg_rtx (word_mode); | |
363 | /* P7/P8 code uses cond for subfc. but P9 uses | |
364 | it for cmpld which needs CCUNSmode. */ | |
365 | rtx cond; | |
366 | if (TARGET_P9_MISC) | |
367 | cond = gen_reg_rtx (CCUNSmode); | |
368 | else | |
369 | cond = gen_reg_rtx (CCmode); | |
370 | ||
371 | /* If we have an LE target without ldbrx and word_mode is DImode, | |
372 | then we must avoid using word_mode. */ | |
373 | int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX | |
374 | && word_mode == DImode); | |
375 | ||
376 | /* Strategy phase. How many ops will this take and should we expand it? */ | |
377 | ||
378 | unsigned HOST_WIDE_INT offset = 0; | |
379 | machine_mode load_mode = | |
380 | select_block_compare_mode (offset, bytes, base_align, word_mode_ok); | |
381 | unsigned int load_mode_size = GET_MODE_SIZE (load_mode); | |
382 | ||
383 | /* We don't want to generate too much code. The limit is rs6000_block_compare_inline_limit loads of the size chosen for the first chunk. */ | |
384 | unsigned HOST_WIDE_INT max_bytes = | |
385 | load_mode_size * (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_limit; | |
386 | if (!IN_RANGE (bytes, 1, max_bytes)) | |
387 | return false; | |
388 | ||
389 | bool generate_6432_conversion = false; /* Set once any block needs the conversion sequence emitted after the loop. */ | |
390 | rtx convert_label = NULL; /* Start of the conversion code; created on first use. */ | |
391 | rtx final_label = NULL; /* End of the whole expansion; created on first use. */ | |
392 | ||
393 | /* Example of generated code for 18 bytes aligned 1 byte. | |
394 | Compiled with -fno-reorder-blocks for clarity. | |
395 | ldbrx 10,31,8 | |
396 | ldbrx 9,7,8 | |
397 | subfc. 9,9,10 | |
398 | bne 0,.L6487 | |
399 | addi 9,12,8 | |
400 | addi 5,11,8 | |
401 | ldbrx 10,0,9 | |
402 | ldbrx 9,0,5 | |
403 | subfc. 9,9,10 | |
404 | bne 0,.L6487 | |
405 | addi 9,12,16 | |
406 | lhbrx 10,0,9 | |
407 | addi 9,11,16 | |
408 | lhbrx 9,0,9 | |
409 | subf 9,9,10 | |
410 | b .L6488 | |
411 | .p2align 4,,15 | |
412 | .L6487: #convert_label | |
413 | popcntd 9,9 | |
414 | subfe 10,10,10 | |
415 | or 9,9,10 | |
416 | .L6488: #final_label | |
417 | extsw 10,9 | |
418 | ||
419 | We start off with DImode for two blocks that jump to the DI->SI conversion | |
420 | if the difference is found there, then a final block of HImode that skips | |
421 | the DI->SI conversion. */ | |
422 | ||
423 | while (bytes > 0) | |
424 | { | |
425 | unsigned int align = compute_current_alignment (base_align, offset); | |
426 | if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED) | |
427 | load_mode = select_block_compare_mode (offset, bytes, align, | |
428 | word_mode_ok); | |
429 | else | |
430 | load_mode = select_block_compare_mode (0, bytes, align, word_mode_ok); /* Offset 0 keeps the overlapping-read cases from being selected. */ | |
431 | load_mode_size = GET_MODE_SIZE (load_mode); | |
432 | if (bytes >= load_mode_size) | |
433 | cmp_bytes = load_mode_size; | |
434 | else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED) | |
435 | { | |
436 | /* Move this load back so it doesn't go past the end. | |
437 | P8/P9 can do this efficiently. */ | |
438 | unsigned int extra_bytes = load_mode_size - bytes; | |
439 | cmp_bytes = bytes; | |
440 | if (extra_bytes < offset) | |
441 | { | |
442 | offset -= extra_bytes; | |
443 | cmp_bytes = load_mode_size; | |
444 | bytes = cmp_bytes; /* The re-read bytes count again so the loop bookkeeping ends at zero. */ | |
445 | } | |
446 | } | |
447 | else | |
448 | /* P7 and earlier can't do the overlapping load trick fast, | |
449 | so this forces a non-overlapping load and a shift to get | |
450 | rid of the extra bytes. */ | |
451 | cmp_bytes = bytes; | |
452 | ||
453 | src1 = adjust_address (orig_src1, load_mode, offset); | |
454 | src2 = adjust_address (orig_src2, load_mode, offset); | |
455 | ||
456 | if (!REG_P (XEXP (src1, 0))) | |
457 | { | |
458 | rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0)); | |
459 | src1 = replace_equiv_address (src1, src1_reg); | |
460 | } | |
461 | set_mem_size (src1, cmp_bytes); | |
462 | ||
463 | if (!REG_P (XEXP (src2, 0))) | |
464 | { | |
465 | rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0)); | |
466 | src2 = replace_equiv_address (src2, src2_reg); | |
467 | } | |
468 | set_mem_size (src2, cmp_bytes); | |
469 | ||
470 | do_load_for_compare (tmp_reg_src1, src1, load_mode); | |
471 | do_load_for_compare (tmp_reg_src2, src2, load_mode); | |
472 | ||
473 | if (cmp_bytes < load_mode_size) | |
474 | { | |
475 | /* Shift unneeded bytes off. */ | |
476 | rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes)); | |
477 | if (word_mode == DImode) | |
478 | { | |
479 | emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh)); | |
480 | emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh)); | |
481 | } | |
482 | else | |
483 | { | |
484 | emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh)); | |
485 | emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh)); | |
486 | } | |
487 | } | |
488 | ||
489 | int remain = bytes - cmp_bytes; /* Bytes left after this block; zero on the last block. */ | |
490 | if (GET_MODE_SIZE (GET_MODE (target)) > GET_MODE_SIZE (load_mode)) | |
491 | { | |
492 | /* Target is larger than load size so we don't need to | |
493 | reduce result size. */ | |
494 | ||
495 | /* We previously did a block that needs 64->32 conversion but | |
496 | the current block does not, so a label is needed to jump | |
497 | to the end. */ | |
498 | if (generate_6432_conversion && !final_label) | |
499 | final_label = gen_label_rtx (); | |
500 | ||
501 | if (remain > 0) | |
502 | { | |
503 | /* This is not the last block, branch to the end if the result | |
504 | of this subtract is not zero. */ | |
505 | if (!final_label) | |
506 | final_label = gen_label_rtx (); | |
507 | rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); | |
508 | rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2); | |
509 | rtx cr = gen_reg_rtx (CCmode); | |
510 | rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr); | |
511 | emit_insn (gen_movsi (target, | |
512 | gen_lowpart (SImode, tmp_reg_src2))); | |
513 | rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx); | |
514 | rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx, | |
515 | fin_ref, pc_rtx); | |
516 | rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); | |
517 | JUMP_LABEL (j) = final_label; | |
518 | LABEL_NUSES (final_label) += 1; | |
519 | } | |
520 | else | |
521 | { | |
522 | if (word_mode == DImode) | |
523 | { | |
524 | emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1, | |
525 | tmp_reg_src2)); | |
526 | emit_insn (gen_movsi (target, | |
527 | gen_lowpart (SImode, tmp_reg_src2))); | |
528 | } | |
529 | else | |
530 | emit_insn (gen_subsi3 (target, tmp_reg_src1, tmp_reg_src2)); | |
531 | ||
532 | if (final_label) /* Jump to the end, over any conversion code emitted after the loop. */ | |
533 | { | |
534 | rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); | |
535 | rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref)); | |
536 | JUMP_LABEL(j) = final_label; | |
537 | LABEL_NUSES (final_label) += 1; | |
538 | emit_barrier (); | |
539 | } | |
540 | } | |
541 | } | |
542 | else | |
543 | { | |
544 | /* Do we need a 64->32 conversion block? We need the 64->32 | |
545 | conversion even if target size == load_mode size because | |
546 | the subtract generates one extra bit. */ | |
547 | generate_6432_conversion = true; | |
548 | ||
549 | if (remain > 0) | |
550 | { | |
551 | if (!convert_label) | |
552 | convert_label = gen_label_rtx (); | |
553 | ||
554 | /* Compare to zero and branch to convert_label if not zero. */ | |
555 | rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label); | |
556 | if (TARGET_P9_MISC) | |
557 | { | |
558 | /* Generate a compare, and convert with a setb later. */ | |
559 | rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1, | |
560 | tmp_reg_src2); | |
561 | emit_insn (gen_rtx_SET (cond, cmp)); | |
562 | } | |
563 | else | |
564 | /* Generate a subfc. and use the longer | |
565 | sequence for conversion. */ | |
566 | if (TARGET_64BIT) | |
567 | emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2, | |
568 | tmp_reg_src1, cond)); | |
569 | else | |
570 | emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2, | |
571 | tmp_reg_src1, cond)); | |
572 | rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx); | |
573 | rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx, | |
574 | cvt_ref, pc_rtx); | |
575 | rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); | |
576 | JUMP_LABEL(j) = convert_label; | |
577 | LABEL_NUSES (convert_label) += 1; | |
578 | } | |
579 | else | |
580 | { | |
581 | /* Just do the subtract/compare. Since this is the last block | |
582 | the convert code will be generated immediately following. */ | |
583 | if (TARGET_P9_MISC) | |
584 | { | |
585 | rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1, | |
586 | tmp_reg_src2); | |
587 | emit_insn (gen_rtx_SET (cond, cmp)); | |
588 | } | |
589 | else | |
590 | if (TARGET_64BIT) | |
591 | emit_insn (gen_subfdi3_carry (tmp_reg_src2, tmp_reg_src2, | |
592 | tmp_reg_src1)); | |
593 | else | |
594 | emit_insn (gen_subfsi3_carry (tmp_reg_src2, tmp_reg_src2, | |
595 | tmp_reg_src1)); | |
596 | } | |
597 | } | |
598 | ||
599 | offset += cmp_bytes; | |
600 | bytes -= cmp_bytes; | |
601 | } | |
602 | ||
603 | if (generate_6432_conversion) | |
604 | { | |
605 | if (convert_label) | |
606 | emit_label (convert_label); | |
607 | ||
608 | /* We need to produce DI result from sub, then convert to target SI | |
609 | while maintaining <0 / ==0 / >0 properties. This sequence works: | |
610 | subfc L,A,B | |
611 | subfe H,H,H | |
612 | popcntd L,L | |
613 | rldimi L,H,6,0 | |
614 | ||
615 | This is an alternate one Segher cooked up if somebody | |
616 | wants to expand this for something that doesn't have popcntd: | |
617 | subfc L,a,b | |
618 | subfe H,x,x | |
619 | addic t,L,-1 | |
620 | subfe v,t,L | |
621 | or z,v,H | |
622 | ||
623 | And finally, p9 can just do this: | |
624 | cmpld A,B | |
625 | setb r */ | |
626 | ||
627 | if (TARGET_P9_MISC) | |
628 | { | |
629 | emit_insn (gen_setb_unsigned (target, cond)); | |
630 | } | |
631 | else | |
632 | { | |
633 | if (TARGET_64BIT) | |
634 | { | |
635 | rtx tmp_reg_ca = gen_reg_rtx (DImode); | |
636 | emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca)); | |
637 | emit_insn (gen_popcntddi2 (tmp_reg_src2, tmp_reg_src2)); | |
638 | emit_insn (gen_iordi3 (tmp_reg_src2, tmp_reg_src2, tmp_reg_ca)); | |
639 | emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2))); | |
640 | } | |
641 | else | |
642 | { | |
643 | rtx tmp_reg_ca = gen_reg_rtx (SImode); | |
644 | emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca)); | |
645 | emit_insn (gen_popcntdsi2 (tmp_reg_src2, tmp_reg_src2)); | |
646 | emit_insn (gen_iorsi3 (target, tmp_reg_src2, tmp_reg_ca)); | |
647 | } | |
648 | } | |
649 | } | |
650 | ||
651 | if (final_label) | |
652 | emit_label (final_label); | |
653 | ||
654 | gcc_assert (bytes == 0); /* The loop must have consumed exactly the requested length. */ | |
655 | return true; | |
656 | } | |
657 | ||
658 | /* Generate alignment check and branch code to set up for | |
659 | strncmp when we don't have DI alignment. The low 12 bits of the address of SRC are compared against 4096 - BYTES; the branch to STRNCMP_LABEL is taken unless that comparison is LT. | |
660 | STRNCMP_LABEL is the label to branch if there is a page crossing. | |
661 | SRC is the string pointer to be examined. | |
662 | BYTES is the max number of bytes to compare. */ | |
663 | static void | |
664 | expand_strncmp_align_check (rtx strncmp_label, rtx src, HOST_WIDE_INT bytes) | |
665 | { | |
666 | rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label); | |
667 | rtx src_check = copy_addr_to_reg (XEXP (src, 0)); /* Work on a copy so the original address is not modified. */ | |
668 | if (GET_MODE (src_check) == SImode) | |
669 | emit_insn (gen_andsi3 (src_check, src_check, GEN_INT (0xfff))); | |
670 | else | |
671 | emit_insn (gen_anddi3 (src_check, src_check, GEN_INT (0xfff))); | |
672 | rtx cond = gen_reg_rtx (CCmode); | |
673 | emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_check, | |
674 | GEN_INT (4096 - bytes))); | |
675 | ||
676 | rtx cmp_rtx = gen_rtx_LT (VOIDmode, cond, const0_rtx); | |
677 | ||
678 | rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, | |
679 | pc_rtx, lab_ref); /* LT falls through (pc_rtx); otherwise jump to STRNCMP_LABEL. */ | |
680 | rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); | |
681 | JUMP_LABEL (j) = strncmp_label; | |
682 | LABEL_NUSES (strncmp_label) += 1; | |
683 | } | |
684 | ||
685 | /* Expand a string compare operation with length, and return | |
686 | true if successful. Return false if we should let the | |
687 | compiler generate normal code, probably a strncmp call. | |
688 | ||
689 | OPERANDS[0] is the target (result). | |
690 | OPERANDS[1] is the first source. | |
691 | OPERANDS[2] is the second source. | |
692 | If NO_LENGTH is zero, then: | |
693 | OPERANDS[3] is the length. | |
694 | OPERANDS[4] is the alignment in bytes. | |
695 | If NO_LENGTH is nonzero, then: | |
696 | OPERANDS[3] is the alignment in bytes. */ | |
697 | bool | |
698 | expand_strn_compare (rtx operands[], int no_length) | |
699 | { | |
700 | rtx target = operands[0]; | |
701 | rtx orig_src1 = operands[1]; | |
702 | rtx orig_src2 = operands[2]; | |
703 | rtx bytes_rtx, align_rtx; | |
704 | if (no_length) | |
705 | { | |
706 | bytes_rtx = NULL; | |
707 | align_rtx = operands[3]; | |
708 | } | |
709 | else | |
710 | { | |
711 | bytes_rtx = operands[3]; | |
712 | align_rtx = operands[4]; | |
713 | } | |
714 | unsigned HOST_WIDE_INT cmp_bytes = 0; | |
715 | rtx src1 = orig_src1; | |
716 | rtx src2 = orig_src2; | |
717 | ||
718 | /* If we have a length, it must be constant. This simplifies things | |
719 | a bit as we don't have to generate code to check if we've exceeded | |
720 | the length. Later this could be expanded to handle this case. */ | |
721 | if (!no_length && !CONST_INT_P (bytes_rtx)) | |
722 | return false; | |
723 | ||
724 | /* This must be a fixed size alignment. */ | |
725 | if (!CONST_INT_P (align_rtx)) | |
726 | return false; | |
727 | ||
728 | unsigned int base_align = UINTVAL (align_rtx); | |
729 | int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT; | |
730 | int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT; | |
731 | ||
732 | /* SLOW_UNALIGNED_ACCESS -- don't do unaligned stuff. */ | |
733 | if (SLOW_UNALIGNED_ACCESS (word_mode, align1) | |
734 | || SLOW_UNALIGNED_ACCESS (word_mode, align2)) | |
735 | return false; | |
736 | ||
737 | gcc_assert (GET_MODE (target) == SImode); | |
738 | ||
739 | /* If we have an LE target without ldbrx and word_mode is DImode, | |
740 | then we must avoid using word_mode. */ | |
741 | int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX | |
742 | && word_mode == DImode); | |
743 | ||
744 | unsigned int word_mode_size = GET_MODE_SIZE (word_mode); | |
745 | ||
746 | unsigned HOST_WIDE_INT offset = 0; | |
747 | unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available. */ | |
748 | unsigned HOST_WIDE_INT compare_length; /* How much to compare inline. */ | |
749 | if (no_length) | |
750 | /* Use this as a standin to determine the mode to use. */ | |
751 | bytes = rs6000_string_compare_inline_limit * word_mode_size; | |
752 | else | |
753 | bytes = UINTVAL (bytes_rtx); | |
754 | ||
755 | machine_mode load_mode = | |
756 | select_block_compare_mode (offset, bytes, base_align, word_mode_ok); | |
757 | unsigned int load_mode_size = GET_MODE_SIZE (load_mode); | |
758 | compare_length = rs6000_string_compare_inline_limit * load_mode_size; | |
759 | ||
760 | /* If we have equality at the end of the last compare and we have not | |
761 | found the end of the string, we need to call strcmp/strncmp to | |
762 | compare the remainder. */ | |
763 | bool equality_compare_rest = false; | |
764 | ||
765 | if (no_length) | |
766 | { | |
767 | bytes = compare_length; | |
768 | equality_compare_rest = true; | |
769 | } | |
770 | else | |
771 | { | |
772 | if (bytes <= compare_length) | |
773 | compare_length = bytes; | |
774 | else | |
775 | equality_compare_rest = true; | |
776 | } | |
777 | ||
778 | rtx result_reg = gen_reg_rtx (word_mode); | |
779 | rtx final_move_label = gen_label_rtx (); | |
780 | rtx final_label = gen_label_rtx (); | |
781 | rtx begin_compare_label = NULL; | |
782 | ||
783 | if (base_align < 8) | |
784 | { | |
785 | /* Generate code that checks distance to 4k boundary for this case. */ | |
786 | begin_compare_label = gen_label_rtx (); | |
787 | rtx strncmp_label = gen_label_rtx (); | |
788 | rtx jmp; | |
789 | ||
790 | /* Strncmp for power8 in glibc does this: | |
791 | rldicl r8,r3,0,52 | |
792 | cmpldi cr7,r8,4096-16 | |
793 | bgt cr7,L(pagecross) */ | |
794 | ||
795 | /* Make sure that the length we use for the alignment test and | |
796 | the subsequent code generation are in agreement so we do not | |
797 | go past the length we tested for a 4k boundary crossing. */ | |
798 | unsigned HOST_WIDE_INT align_test = compare_length; | |
799 | if (align_test < 8) | |
800 | { | |
801 | align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test); | |
802 | base_align = align_test; | |
803 | } | |
804 | else | |
805 | { | |
806 | align_test = ROUND_UP (align_test, 8); | |
807 | base_align = 8; | |
808 | } | |
809 | ||
810 | if (align1 < 8) | |
811 | expand_strncmp_align_check (strncmp_label, src1, align_test); | |
812 | if (align2 < 8) | |
813 | expand_strncmp_align_check (strncmp_label, src2, align_test); | |
814 | ||
815 | /* Now generate the following sequence: | |
816 | - branch to begin_compare | |
817 | - strncmp_label | |
818 | - call to strncmp | |
819 | - branch to final_label | |
820 | - begin_compare_label */ | |
821 | ||
822 | rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label); | |
823 | jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref)); | |
824 | JUMP_LABEL (jmp) = begin_compare_label; | |
825 | LABEL_NUSES (begin_compare_label) += 1; | |
826 | emit_barrier (); | |
827 | ||
828 | emit_label (strncmp_label); | |
829 | ||
830 | if (!REG_P (XEXP (src1, 0))) | |
831 | { | |
832 | rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0)); | |
833 | src1 = replace_equiv_address (src1, src1_reg); | |
834 | } | |
835 | ||
836 | if (!REG_P (XEXP (src2, 0))) | |
837 | { | |
838 | rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0)); | |
839 | src2 = replace_equiv_address (src2, src2_reg); | |
840 | } | |
841 | ||
842 | if (no_length) | |
843 | { | |
844 | tree fun = builtin_decl_explicit (BUILT_IN_STRCMP); | |
845 | emit_library_call_value (XEXP (DECL_RTL (fun), 0), | |
846 | target, LCT_NORMAL, GET_MODE (target), 2, | |
847 | force_reg (Pmode, XEXP (src1, 0)), Pmode, | |
848 | force_reg (Pmode, XEXP (src2, 0)), Pmode); | |
849 | } | |
850 | else | |
851 | { | |
852 | /* -m32 -mpowerpc64 results in word_mode being DImode even | |
853 | though otherwise it is 32-bit. The length arg to strncmp | |
854 | is a size_t which will be the same size as pointers. */ | |
855 | rtx len_rtx; | |
856 | if (TARGET_64BIT) | |
857 | len_rtx = gen_reg_rtx (DImode); | |
858 | else | |
859 | len_rtx = gen_reg_rtx (SImode); | |
860 | ||
861 | emit_move_insn (len_rtx, bytes_rtx); | |
862 | ||
863 | tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP); | |
864 | emit_library_call_value (XEXP (DECL_RTL (fun), 0), | |
865 | target, LCT_NORMAL, GET_MODE (target), 3, | |
866 | force_reg (Pmode, XEXP (src1, 0)), Pmode, | |
867 | force_reg (Pmode, XEXP (src2, 0)), Pmode, | |
868 | len_rtx, GET_MODE (len_rtx)); | |
869 | } | |
870 | ||
871 | rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); | |
872 | jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref)); | |
873 | JUMP_LABEL (jmp) = final_label; | |
874 | LABEL_NUSES (final_label) += 1; | |
875 | emit_barrier (); | |
876 | emit_label (begin_compare_label); | |
877 | } | |
878 | ||
879 | rtx cleanup_label = NULL; | |
880 | rtx tmp_reg_src1 = gen_reg_rtx (word_mode); | |
881 | rtx tmp_reg_src2 = gen_reg_rtx (word_mode); | |
882 | ||
883 | /* Generate sequence of ld/ldbrx, cmpb to compare out | |
884 | to the length specified. */ | |
885 | unsigned HOST_WIDE_INT bytes_to_compare = compare_length; | |
886 | while (bytes_to_compare > 0) | |
887 | { | |
888 | /* Compare sequence: | |
889 | check each 8B with: ld/ld cmpd bne | |
890 | If equal, use rldicr/cmpb to check for zero byte. | |
891 | cleanup code at end: | |
892 | cmpb get byte that differs | |
893 | cmpb look for zero byte | |
894 | orc combine | |
895 | cntlzd get bit of first zero/diff byte | |
896 | subfic convert for rldcl use | |
897 | rldcl rldcl extract diff/zero byte | |
898 | subf subtract for final result | |
899 | ||
900 | The last compare can branch around the cleanup code if the | |
901 | result is zero because the strings are exactly equal. */ | |
902 | unsigned int align = compute_current_alignment (base_align, offset); | |
903 | if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED) | |
904 | load_mode = select_block_compare_mode (offset, bytes_to_compare, align, | |
905 | word_mode_ok); | |
906 | else | |
907 | load_mode = select_block_compare_mode (0, bytes_to_compare, align, | |
908 | word_mode_ok); | |
909 | load_mode_size = GET_MODE_SIZE (load_mode); | |
910 | if (bytes_to_compare >= load_mode_size) | |
911 | cmp_bytes = load_mode_size; | |
912 | else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED) | |
913 | { | |
914 | /* Move this load back so it doesn't go past the end. | |
915 | P8/P9 can do this efficiently. */ | |
916 | unsigned int extra_bytes = load_mode_size - bytes_to_compare; | |
917 | cmp_bytes = bytes_to_compare; | |
918 | if (extra_bytes < offset) | |
919 | { | |
920 | offset -= extra_bytes; | |
921 | cmp_bytes = load_mode_size; | |
922 | bytes_to_compare = cmp_bytes; | |
923 | } | |
924 | } | |
925 | else | |
926 | /* P7 and earlier can't do the overlapping load trick fast, | |
927 | so this forces a non-overlapping load and a shift to get | |
928 | rid of the extra bytes. */ | |
929 | cmp_bytes = bytes_to_compare; | |
930 | ||
931 | src1 = adjust_address (orig_src1, load_mode, offset); | |
932 | src2 = adjust_address (orig_src2, load_mode, offset); | |
933 | ||
934 | if (!REG_P (XEXP (src1, 0))) | |
935 | { | |
936 | rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0)); | |
937 | src1 = replace_equiv_address (src1, src1_reg); | |
938 | } | |
939 | set_mem_size (src1, cmp_bytes); | |
940 | ||
941 | if (!REG_P (XEXP (src2, 0))) | |
942 | { | |
943 | rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0)); | |
944 | src2 = replace_equiv_address (src2, src2_reg); | |
945 | } | |
946 | set_mem_size (src2, cmp_bytes); | |
947 | ||
948 | do_load_for_compare (tmp_reg_src1, src1, load_mode); | |
949 | do_load_for_compare (tmp_reg_src2, src2, load_mode); | |
950 | ||
951 | /* We must always left-align the data we read, and | |
952 | clear any bytes to the right that are beyond the string. | |
953 | Otherwise the cmpb sequence won't produce the correct | |
954 | results. The beginning of the compare will be done | |
955 | with word_mode so will not have any extra shifts or | |
956 | clear rights. */ | |
957 | ||
958 | if (load_mode_size < word_mode_size) | |
959 | { | |
960 | /* Rotate left first. */ | |
961 | rtx sh = GEN_INT (BITS_PER_UNIT * (word_mode_size - load_mode_size)); | |
962 | if (word_mode == DImode) | |
963 | { | |
964 | emit_insn (gen_rotldi3 (tmp_reg_src1, tmp_reg_src1, sh)); | |
965 | emit_insn (gen_rotldi3 (tmp_reg_src2, tmp_reg_src2, sh)); | |
966 | } | |
967 | else | |
968 | { | |
969 | emit_insn (gen_rotlsi3 (tmp_reg_src1, tmp_reg_src1, sh)); | |
970 | emit_insn (gen_rotlsi3 (tmp_reg_src2, tmp_reg_src2, sh)); | |
971 | } | |
972 | } | |
973 | ||
974 | if (cmp_bytes < word_mode_size) | |
975 | { | |
976 | /* Now clear right. This plus the rotate can be | |
977 | turned into a rldicr instruction. */ | |
978 | HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes); | |
979 | rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb); | |
980 | if (word_mode == DImode) | |
981 | { | |
982 | emit_insn (gen_anddi3_mask (tmp_reg_src1, tmp_reg_src1, mask)); | |
983 | emit_insn (gen_anddi3_mask (tmp_reg_src2, tmp_reg_src2, mask)); | |
984 | } | |
985 | else | |
986 | { | |
987 | emit_insn (gen_andsi3_mask (tmp_reg_src1, tmp_reg_src1, mask)); | |
988 | emit_insn (gen_andsi3_mask (tmp_reg_src2, tmp_reg_src2, mask)); | |
989 | } | |
990 | } | |
991 | ||
992 | /* Cases to handle. A and B are chunks of the two strings. | |
993 | 1: Not end of comparison: | |
994 | A != B: branch to cleanup code to compute result. | |
995 | A == B: check for 0 byte, next block if not found. | |
996 | 2: End of the inline comparison: | |
997 | A != B: branch to cleanup code to compute result. | |
998 | A == B: check for 0 byte, call strcmp/strncmp | |
999 | 3: compared requested N bytes: | |
1000 | A == B: branch to result 0. | |
1001 | A != B: cleanup code to compute result. */ | |
1002 | ||
1003 | unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes; | |
1004 | ||
1005 | rtx dst_label; | |
1006 | if (remain > 0 || equality_compare_rest) | |
1007 | { | |
1008 | /* Branch to cleanup code, otherwise fall through to do | |
1009 | more compares. */ | |
1010 | if (!cleanup_label) | |
1011 | cleanup_label = gen_label_rtx (); | |
1012 | dst_label = cleanup_label; | |
1013 | } | |
1014 | else | |
1015 | /* Branch to end and produce result of 0. */ | |
1016 | dst_label = final_move_label; | |
1017 | ||
1018 | rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label); | |
1019 | rtx cond = gen_reg_rtx (CCmode); | |
1020 | ||
1021 | /* Always produce the 0 result, it is needed if | |
1022 | cmpb finds a 0 byte in this chunk. */ | |
1023 | rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2); | |
1024 | rs6000_emit_dot_insn (result_reg, tmp, 1, cond); | |
1025 | ||
1026 | rtx cmp_rtx; | |
1027 | if (remain == 0 && !equality_compare_rest) | |
1028 | cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx); | |
1029 | else | |
1030 | cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx); | |
1031 | ||
1032 | rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, | |
1033 | lab_ref, pc_rtx); | |
1034 | rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); | |
1035 | JUMP_LABEL (j) = dst_label; | |
1036 | LABEL_NUSES (dst_label) += 1; | |
1037 | ||
1038 | if (remain > 0 || equality_compare_rest) | |
1039 | { | |
1040 | /* Generate a cmpb to test for a 0 byte and branch | |
1041 | to final result if found. */ | |
1042 | rtx cmpb_zero = gen_reg_rtx (word_mode); | |
1043 | rtx lab_ref_fin = gen_rtx_LABEL_REF (VOIDmode, final_move_label); | |
1044 | rtx condz = gen_reg_rtx (CCmode); | |
1045 | rtx zero_reg = gen_reg_rtx (word_mode); | |
1046 | if (word_mode == SImode) | |
1047 | { | |
1048 | emit_insn (gen_movsi (zero_reg, GEN_INT (0))); | |
1049 | emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg)); | |
1050 | if (cmp_bytes < word_mode_size) | |
1051 | { | |
1052 | /* Don't want to look at zero bytes past end. */ | |
1053 | HOST_WIDE_INT mb = | |
1054 | BITS_PER_UNIT * (word_mode_size - cmp_bytes); | |
1055 | rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb); | |
1056 | emit_insn (gen_andsi3_mask (cmpb_zero, cmpb_zero, mask)); | |
1057 | } | |
1058 | } | |
1059 | else | |
1060 | { | |
1061 | emit_insn (gen_movdi (zero_reg, GEN_INT (0))); | |
1062 | emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg)); | |
1063 | if (cmp_bytes < word_mode_size) | |
1064 | { | |
1065 | /* Don't want to look at zero bytes past end. */ | |
1066 | HOST_WIDE_INT mb = | |
1067 | BITS_PER_UNIT * (word_mode_size - cmp_bytes); | |
1068 | rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb); | |
1069 | emit_insn (gen_anddi3_mask (cmpb_zero, cmpb_zero, mask)); | |
1070 | } | |
1071 | } | |
1072 | ||
1073 | emit_move_insn (condz, gen_rtx_COMPARE (CCmode, cmpb_zero, zero_reg)); | |
1074 | rtx cmpnz_rtx = gen_rtx_NE (VOIDmode, condz, const0_rtx); | |
1075 | rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmpnz_rtx, | |
1076 | lab_ref_fin, pc_rtx); | |
1077 | rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); | |
1078 | JUMP_LABEL (j2) = final_move_label; | |
1079 | LABEL_NUSES (final_move_label) += 1; | |
1080 | ||
1081 | } | |
1082 | ||
1083 | offset += cmp_bytes; | |
1084 | bytes_to_compare -= cmp_bytes; | |
1085 | } | |
1086 | ||
1087 | if (equality_compare_rest) | |
1088 | { | |
1089 | /* Update pointers past what has been compared already. */ | |
1090 | src1 = adjust_address (orig_src1, load_mode, offset); | |
1091 | src2 = adjust_address (orig_src2, load_mode, offset); | |
1092 | ||
1093 | if (!REG_P (XEXP (src1, 0))) | |
1094 | { | |
1095 | rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0)); | |
1096 | src1 = replace_equiv_address (src1, src1_reg); | |
1097 | } | |
1098 | set_mem_size (src1, cmp_bytes); | |
1099 | ||
1100 | if (!REG_P (XEXP (src2, 0))) | |
1101 | { | |
1102 | rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0)); | |
1103 | src2 = replace_equiv_address (src2, src2_reg); | |
1104 | } | |
1105 | set_mem_size (src2, cmp_bytes); | |
1106 | ||
1107 | /* Construct call to strcmp/strncmp to compare the rest of the string. */ | |
1108 | if (no_length) | |
1109 | { | |
1110 | tree fun = builtin_decl_explicit (BUILT_IN_STRCMP); | |
1111 | emit_library_call_value (XEXP (DECL_RTL (fun), 0), | |
1112 | target, LCT_NORMAL, GET_MODE (target), 2, | |
1113 | force_reg (Pmode, XEXP (src1, 0)), Pmode, | |
1114 | force_reg (Pmode, XEXP (src2, 0)), Pmode); | |
1115 | } | |
1116 | else | |
1117 | { | |
1118 | rtx len_rtx; | |
1119 | if (TARGET_64BIT) | |
1120 | len_rtx = gen_reg_rtx (DImode); | |
1121 | else | |
1122 | len_rtx = gen_reg_rtx (SImode); | |
1123 | ||
1124 | emit_move_insn (len_rtx, GEN_INT (bytes - compare_length)); | |
1125 | tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP); | |
1126 | emit_library_call_value (XEXP (DECL_RTL (fun), 0), | |
1127 | target, LCT_NORMAL, GET_MODE (target), 3, | |
1128 | force_reg (Pmode, XEXP (src1, 0)), Pmode, | |
1129 | force_reg (Pmode, XEXP (src2, 0)), Pmode, | |
1130 | len_rtx, GET_MODE (len_rtx)); | |
1131 | } | |
1132 | ||
1133 | rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); | |
1134 | rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref)); | |
1135 | JUMP_LABEL (jmp) = final_label; | |
1136 | LABEL_NUSES (final_label) += 1; | |
1137 | emit_barrier (); | |
1138 | } | |
1139 | ||
1140 | if (cleanup_label) | |
1141 | emit_label (cleanup_label); | |
1142 | ||
1143 | /* Generate the final sequence that identifies the differing | |
1144 | byte and generates the final result, taking into account | |
1145 | zero bytes: | |
1146 | ||
1147 | cmpb cmpb_result1, src1, src2 | |
1148 | cmpb cmpb_result2, src1, zero | |
1149 | orc cmpb_result1, cmp_result1, cmpb_result2 | |
1150 | cntlzd get bit of first zero/diff byte | |
1151 | addi convert for rldcl use | |
1152 | rldcl rldcl extract diff/zero byte | |
1153 | subf subtract for final result | |
1154 | */ | |
1155 | ||
1156 | rtx cmpb_diff = gen_reg_rtx (word_mode); | |
1157 | rtx cmpb_zero = gen_reg_rtx (word_mode); | |
1158 | rtx rot_amt = gen_reg_rtx (word_mode); | |
1159 | rtx zero_reg = gen_reg_rtx (word_mode); | |
1160 | ||
1161 | rtx rot1_1 = gen_reg_rtx (word_mode); | |
1162 | rtx rot1_2 = gen_reg_rtx (word_mode); | |
1163 | rtx rot2_1 = gen_reg_rtx (word_mode); | |
1164 | rtx rot2_2 = gen_reg_rtx (word_mode); | |
1165 | ||
1166 | if (word_mode == SImode) | |
1167 | { | |
1168 | emit_insn (gen_cmpbsi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2)); | |
1169 | emit_insn (gen_movsi (zero_reg, GEN_INT (0))); | |
1170 | emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg)); | |
1171 | emit_insn (gen_one_cmplsi2 (cmpb_diff,cmpb_diff)); | |
1172 | emit_insn (gen_iorsi3 (cmpb_diff, cmpb_diff, cmpb_zero)); | |
1173 | emit_insn (gen_clzsi2 (rot_amt, cmpb_diff)); | |
1174 | emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8))); | |
1175 | emit_insn (gen_rotlsi3 (rot1_1, tmp_reg_src1, | |
1176 | gen_lowpart (SImode, rot_amt))); | |
1177 | emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff))); | |
1178 | emit_insn (gen_rotlsi3 (rot2_1, tmp_reg_src2, | |
1179 | gen_lowpart (SImode, rot_amt))); | |
1180 | emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff))); | |
1181 | emit_insn (gen_subsi3 (result_reg, rot1_2, rot2_2)); | |
1182 | } | |
1183 | else | |
1184 | { | |
1185 | emit_insn (gen_cmpbdi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2)); | |
1186 | emit_insn (gen_movdi (zero_reg, GEN_INT (0))); | |
1187 | emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg)); | |
1188 | emit_insn (gen_one_cmpldi2 (cmpb_diff,cmpb_diff)); | |
1189 | emit_insn (gen_iordi3 (cmpb_diff, cmpb_diff, cmpb_zero)); | |
1190 | emit_insn (gen_clzdi2 (rot_amt, cmpb_diff)); | |
1191 | emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8))); | |
1192 | emit_insn (gen_rotldi3 (rot1_1, tmp_reg_src1, | |
1193 | gen_lowpart (SImode, rot_amt))); | |
1194 | emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff))); | |
1195 | emit_insn (gen_rotldi3 (rot2_1, tmp_reg_src2, | |
1196 | gen_lowpart (SImode, rot_amt))); | |
1197 | emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff))); | |
1198 | emit_insn (gen_subdi3 (result_reg, rot1_2, rot2_2)); | |
1199 | } | |
1200 | ||
1201 | emit_label (final_move_label); | |
1202 | emit_insn (gen_movsi (target, | |
1203 | gen_lowpart (SImode, result_reg))); | |
1204 | emit_label (final_label); | |
1205 | return true; | |
1206 | } | |
1207 | ||
1208 | /* Expand a block move operation, and return 1 if successful. Return 0 | |
1209 | if we should let the compiler generate normal code. | |
1210 | ||
1211 | operands[0] is the destination | |
1212 | operands[1] is the source | |
1213 | operands[2] is the length | |
1214 | operands[3] is the alignment */ | |
1215 | ||
1216 | #define MAX_MOVE_REG 4 | |
1217 | ||
1218 | int | |
1219 | expand_block_move (rtx operands[]) | |
1220 | { | |
1221 | rtx orig_dest = operands[0]; | |
1222 | rtx orig_src = operands[1]; | |
1223 | rtx bytes_rtx = operands[2]; | |
1224 | rtx align_rtx = operands[3]; | |
1225 | int constp = (GET_CODE (bytes_rtx) == CONST_INT); | |
1226 | int align; | |
1227 | int bytes; | |
1228 | int offset; | |
1229 | int move_bytes; | |
1230 | rtx stores[MAX_MOVE_REG]; | |
1231 | int num_reg = 0; | |
1232 | ||
1233 | /* If this is not a fixed size move, just call memcpy */ | |
1234 | if (! constp) | |
1235 | return 0; | |
1236 | ||
1237 | /* This must be a fixed size alignment */ | |
1238 | gcc_assert (GET_CODE (align_rtx) == CONST_INT); | |
1239 | align = INTVAL (align_rtx) * BITS_PER_UNIT; | |
1240 | ||
1241 | /* Anything to move? */ | |
1242 | bytes = INTVAL (bytes_rtx); | |
1243 | if (bytes <= 0) | |
1244 | return 1; | |
1245 | ||
1246 | if (bytes > rs6000_block_move_inline_limit) | |
1247 | return 0; | |
1248 | ||
1249 | for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes) | |
1250 | { | |
1251 | union { | |
1252 | rtx (*movmemsi) (rtx, rtx, rtx, rtx); | |
1253 | rtx (*mov) (rtx, rtx); | |
1254 | } gen_func; | |
1255 | machine_mode mode = BLKmode; | |
1256 | rtx src, dest; | |
1257 | ||
1258 | /* Altivec first, since it will be faster than a string move | |
1259 | when it applies, and usually not significantly larger. */ | |
1260 | if (TARGET_ALTIVEC && bytes >= 16 && align >= 128) | |
1261 | { | |
1262 | move_bytes = 16; | |
1263 | mode = V4SImode; | |
1264 | gen_func.mov = gen_movv4si; | |
1265 | } | |
1266 | else if (TARGET_STRING | |
1267 | && bytes > 24 /* move up to 32 bytes at a time */ | |
1268 | && ! fixed_regs[5] | |
1269 | && ! fixed_regs[6] | |
1270 | && ! fixed_regs[7] | |
1271 | && ! fixed_regs[8] | |
1272 | && ! fixed_regs[9] | |
1273 | && ! fixed_regs[10] | |
1274 | && ! fixed_regs[11] | |
1275 | && ! fixed_regs[12]) | |
1276 | { | |
1277 | move_bytes = (bytes > 32) ? 32 : bytes; | |
1278 | gen_func.movmemsi = gen_movmemsi_8reg; | |
1279 | } | |
1280 | else if (TARGET_STRING | |
1281 | && bytes > 16 /* move up to 24 bytes at a time */ | |
1282 | && ! fixed_regs[5] | |
1283 | && ! fixed_regs[6] | |
1284 | && ! fixed_regs[7] | |
1285 | && ! fixed_regs[8] | |
1286 | && ! fixed_regs[9] | |
1287 | && ! fixed_regs[10]) | |
1288 | { | |
1289 | move_bytes = (bytes > 24) ? 24 : bytes; | |
1290 | gen_func.movmemsi = gen_movmemsi_6reg; | |
1291 | } | |
1292 | else if (TARGET_STRING | |
1293 | && bytes > 8 /* move up to 16 bytes at a time */ | |
1294 | && ! fixed_regs[5] | |
1295 | && ! fixed_regs[6] | |
1296 | && ! fixed_regs[7] | |
1297 | && ! fixed_regs[8]) | |
1298 | { | |
1299 | move_bytes = (bytes > 16) ? 16 : bytes; | |
1300 | gen_func.movmemsi = gen_movmemsi_4reg; | |
1301 | } | |
1302 | else if (bytes >= 8 && TARGET_POWERPC64 | |
1303 | && (align >= 64 || !STRICT_ALIGNMENT)) | |
1304 | { | |
1305 | move_bytes = 8; | |
1306 | mode = DImode; | |
1307 | gen_func.mov = gen_movdi; | |
1308 | if (offset == 0 && align < 64) | |
1309 | { | |
1310 | rtx addr; | |
1311 | ||
1312 | /* If the address form is reg+offset with offset not a | |
1313 | multiple of four, reload into reg indirect form here | |
1314 | rather than waiting for reload. This way we get one | |
1315 | reload, not one per load and/or store. */ | |
1316 | addr = XEXP (orig_dest, 0); | |
1317 | if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM) | |
1318 | && GET_CODE (XEXP (addr, 1)) == CONST_INT | |
1319 | && (INTVAL (XEXP (addr, 1)) & 3) != 0) | |
1320 | { | |
1321 | addr = copy_addr_to_reg (addr); | |
1322 | orig_dest = replace_equiv_address (orig_dest, addr); | |
1323 | } | |
1324 | addr = XEXP (orig_src, 0); | |
1325 | if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM) | |
1326 | && GET_CODE (XEXP (addr, 1)) == CONST_INT | |
1327 | && (INTVAL (XEXP (addr, 1)) & 3) != 0) | |
1328 | { | |
1329 | addr = copy_addr_to_reg (addr); | |
1330 | orig_src = replace_equiv_address (orig_src, addr); | |
1331 | } | |
1332 | } | |
1333 | } | |
1334 | else if (TARGET_STRING && bytes > 4 && !TARGET_POWERPC64) | |
1335 | { /* move up to 8 bytes at a time */ | |
1336 | move_bytes = (bytes > 8) ? 8 : bytes; | |
1337 | gen_func.movmemsi = gen_movmemsi_2reg; | |
1338 | } | |
1339 | else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT)) | |
1340 | { /* move 4 bytes */ | |
1341 | move_bytes = 4; | |
1342 | mode = SImode; | |
1343 | gen_func.mov = gen_movsi; | |
1344 | } | |
1345 | else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT)) | |
1346 | { /* move 2 bytes */ | |
1347 | move_bytes = 2; | |
1348 | mode = HImode; | |
1349 | gen_func.mov = gen_movhi; | |
1350 | } | |
1351 | else if (TARGET_STRING && bytes > 1) | |
1352 | { /* move up to 4 bytes at a time */ | |
1353 | move_bytes = (bytes > 4) ? 4 : bytes; | |
1354 | gen_func.movmemsi = gen_movmemsi_1reg; | |
1355 | } | |
1356 | else /* move 1 byte at a time */ | |
1357 | { | |
1358 | move_bytes = 1; | |
1359 | mode = QImode; | |
1360 | gen_func.mov = gen_movqi; | |
1361 | } | |
1362 | ||
1363 | src = adjust_address (orig_src, mode, offset); | |
1364 | dest = adjust_address (orig_dest, mode, offset); | |
1365 | ||
1366 | if (mode != BLKmode) | |
1367 | { | |
1368 | rtx tmp_reg = gen_reg_rtx (mode); | |
1369 | ||
1370 | emit_insn ((*gen_func.mov) (tmp_reg, src)); | |
1371 | stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg); | |
1372 | } | |
1373 | ||
1374 | if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes) | |
1375 | { | |
1376 | int i; | |
1377 | for (i = 0; i < num_reg; i++) | |
1378 | emit_insn (stores[i]); | |
1379 | num_reg = 0; | |
1380 | } | |
1381 | ||
1382 | if (mode == BLKmode) | |
1383 | { | |
1384 | /* Move the address into scratch registers. The movmemsi | |
1385 | patterns require zero offset. */ | |
1386 | if (!REG_P (XEXP (src, 0))) | |
1387 | { | |
1388 | rtx src_reg = copy_addr_to_reg (XEXP (src, 0)); | |
1389 | src = replace_equiv_address (src, src_reg); | |
1390 | } | |
1391 | set_mem_size (src, move_bytes); | |
1392 | ||
1393 | if (!REG_P (XEXP (dest, 0))) | |
1394 | { | |
1395 | rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0)); | |
1396 | dest = replace_equiv_address (dest, dest_reg); | |
1397 | } | |
1398 | set_mem_size (dest, move_bytes); | |
1399 | ||
1400 | emit_insn ((*gen_func.movmemsi) (dest, src, | |
1401 | GEN_INT (move_bytes & 31), | |
1402 | align_rtx)); | |
1403 | } | |
1404 | } | |
1405 | ||
1406 | return 1; | |
1407 | } | |
1408 | ||
1409 | \f | |
1410 | /* Return a string to perform a load_multiple operation. | |
1411 | operands[0] is the vector. | |
1412 | operands[1] is the source address. | |
1413 | operands[2] is the first destination register. */ | |
1414 | ||
1415 | const char * | |
1416 | rs6000_output_load_multiple (rtx operands[3]) | |
1417 | { | |
1418 | /* We have to handle the case where the pseudo used to contain the address | |
1419 | is assigned to one of the output registers. */ | |
1420 | int i, j; | |
1421 | int words = XVECLEN (operands[0], 0); | |
1422 | rtx xop[10]; | |
1423 | ||
1424 | if (XVECLEN (operands[0], 0) == 1) | |
1425 | return "lwz %2,0(%1)"; | |
1426 | ||
1427 | for (i = 0; i < words; i++) | |
1428 | if (refers_to_regno_p (REGNO (operands[2]) + i, operands[1])) | |
1429 | { | |
1430 | if (i == words-1) | |
1431 | { | |
1432 | xop[0] = GEN_INT (4 * (words-1)); | |
1433 | xop[1] = operands[1]; | |
1434 | xop[2] = operands[2]; | |
1435 | output_asm_insn ("lswi %2,%1,%0\n\tlwz %1,%0(%1)", xop); | |
1436 | return ""; | |
1437 | } | |
1438 | else if (i == 0) | |
1439 | { | |
1440 | xop[0] = GEN_INT (4 * (words-1)); | |
1441 | xop[1] = operands[1]; | |
1442 | xop[2] = gen_rtx_REG (SImode, REGNO (operands[2]) + 1); | |
1443 | output_asm_insn ("addi %1,%1,4\n\tlswi %2,%1,%0\n\tlwz %1,-4(%1)", xop); | |
1444 | return ""; | |
1445 | } | |
1446 | else | |
1447 | { | |
1448 | for (j = 0; j < words; j++) | |
1449 | if (j != i) | |
1450 | { | |
1451 | xop[0] = GEN_INT (j * 4); | |
1452 | xop[1] = operands[1]; | |
1453 | xop[2] = gen_rtx_REG (SImode, REGNO (operands[2]) + j); | |
1454 | output_asm_insn ("lwz %2,%0(%1)", xop); | |
1455 | } | |
1456 | xop[0] = GEN_INT (i * 4); | |
1457 | xop[1] = operands[1]; | |
1458 | output_asm_insn ("lwz %1,%0(%1)", xop); | |
1459 | return ""; | |
1460 | } | |
1461 | } | |
1462 | ||
1463 | return "lswi %2,%1,%N0"; | |
1464 | } | |
1465 |