]>
Commit | Line | Data |
---|---|---|
8845cb37 AS |
1 | /* Subroutines used to expand string and block move, clear, |
2 | compare and other operations for PowerPC. | |
a5544970 | 3 | Copyright (C) 1991-2019 Free Software Foundation, Inc. |
8845cb37 AS |
4 | |
5 | This file is part of GCC. | |
6 | ||
7 | GCC is free software; you can redistribute it and/or modify it | |
8 | under the terms of the GNU General Public License as published | |
9 | by the Free Software Foundation; either version 3, or (at your | |
10 | option) any later version. | |
11 | ||
12 | GCC is distributed in the hope that it will be useful, but WITHOUT | |
13 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
14 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public | |
15 | License for more details. | |
16 | ||
17 | You should have received a copy of the GNU General Public License | |
18 | along with GCC; see the file COPYING3. If not see | |
19 | <http://www.gnu.org/licenses/>. */ | |
20 | ||
8fcc61f8 RS |
21 | #define IN_TARGET_CODE 1 |
22 | ||
8845cb37 AS |
23 | #include "config.h" |
24 | #include "system.h" | |
25 | #include "coretypes.h" | |
26 | #include "backend.h" | |
27 | #include "rtl.h" | |
28 | #include "tree.h" | |
29 | #include "memmodel.h" | |
30 | #include "tm_p.h" | |
31 | #include "ira.h" | |
32 | #include "print-tree.h" | |
33 | #include "varasm.h" | |
34 | #include "explow.h" | |
35 | #include "expr.h" | |
36 | #include "output.h" | |
e0bd6c9f | 37 | #include "target.h" |
8845cb37 AS |
38 | |
/* Expand a block clear operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the length
   operands[3] is the alignment
   (operands[2], the value to store, is not read here; this expander
   only handles clearing to zero.)  */

int
expand_block_clear (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx bytes_rtx = operands[1];
  rtx align_rtx = operands[3];
  bool constp = CONST_INT_P (bytes_rtx);
  HOST_WIDE_INT align;
  HOST_WIDE_INT bytes;
  int offset;
  int clear_bytes;	/* size of the chunk cleared by this iteration  */
  int clear_step;	/* widest store unit the target supports here  */

  /* If this is not a fixed-size clear, return 0 so the caller falls
     back to the normal (library call) code.  */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment  */
  gcc_assert (CONST_INT_P (align_rtx));
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to clear? */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  /* Use the builtin memset after a point, to avoid huge code bloat.
     When optimize_size, avoid any significant code bloat; calling
     memset is about 4 instructions, so allow for one instruction to
     load zero and three to do clearing.  */
  if (TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
    clear_step = 16;
  else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
    clear_step = 8;
  else
    clear_step = 4;

  if (optimize_size && bytes > 3 * clear_step)
    return 0;
  if (! optimize_size && bytes > 8 * clear_step)
    return 0;

  /* Unaligned VSX stores are only worthwhile for blocks of 32+ bytes.  */
  bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);

  /* Emit stores widest-first; each iteration picks the largest mode
     allowed by the remaining byte count and the alignment.  */
  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
    {
      machine_mode mode = BLKmode;
      rtx dest;

      if (TARGET_ALTIVEC
	  && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok)))
	{
	  clear_bytes = 16;
	  mode = V4SImode;
	}
      else if (bytes >= 8 && TARGET_POWERPC64
	       && (align >= 64 || !STRICT_ALIGNMENT))
	{
	  clear_bytes = 8;
	  mode = DImode;
	  if (offset == 0 && align < 64)
	    {
	      rtx addr;

	      /* If the address form is reg+offset with offset not a
		 multiple of four, reload into reg indirect form here
		 rather than waiting for reload.  This way we get one
		 reload, not one per store.  */
	      addr = XEXP (orig_dest, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && CONST_INT_P (XEXP (addr, 1))
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_dest = replace_equiv_address (orig_dest, addr);
		}
	    }
	}
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
	{			/* move 4 bytes */
	  clear_bytes = 4;
	  mode = SImode;
	}
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
	{			/* move 2 bytes */
	  clear_bytes = 2;
	  mode = HImode;
	}
      else /* move 1 byte at a time */
	{
	  clear_bytes = 1;
	  mode = QImode;
	}

      dest = adjust_address (orig_dest, mode, offset);

      /* Store a zero of the chosen width.  */
      emit_move_insn (dest, CONST0_RTX (mode));
    }

  return 1;
}
147 | ||
/* Figure out the correct instructions to generate to load data for
   block compare.  MODE is used for the read from memory, and
   data is zero extended if REG is wider than MODE.  If LE code
   is being generated, bswap loads are used.

   REG is the destination register to move the data into.
   MEM is the memory block being read.
   MODE is the mode of memory to use for the read.  */
static void
do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
{
  /* Dispatch on the destination width first, then on the memory mode;
     any combination not listed is a caller error (gcc_unreachable).  */
  switch (GET_MODE (reg))
    {
    case E_V16QImode:
      switch (mode)
	{
	case E_V16QImode:
	  if (!BYTES_BIG_ENDIAN)
	    {
	      if (TARGET_P9_VECTOR)
		emit_insn (gen_vsx_ld_elemrev_v16qi_internal (reg, mem));
	      else
		{
		  /* Pre-P9 LE: no byte-reversed V16QI load, so load as
		     V2DI with element reversal through a subreg view of
		     the same register.  */
		  rtx reg_v2di = simplify_gen_subreg (V2DImode, reg,
						      V16QImode, 0);
		  gcc_assert (MEM_P (mem));
		  rtx addr = XEXP (mem, 0);
		  rtx mem_v2di = gen_rtx_MEM (V2DImode, addr);
		  MEM_COPY_ATTRIBUTES (mem_v2di, mem);
		  /* NOTE(review): this sets the size on MEM rather than
		     MEM_V2DI; both modes are 16 bytes so the value is
		     the same — confirm which was intended.  */
		  set_mem_size (mem, GET_MODE_SIZE (V2DImode));
		  emit_insn (gen_vsx_ld_elemrev_v2di (reg_v2di, mem_v2di));
		}
	    }
	  else
	    emit_insn (gen_vsx_movv2di_64bit (reg, mem));
	  break;
	default:
	  gcc_unreachable ();
	}
      break;
    case E_DImode:
      switch (mode)
	{
	case E_QImode:
	  /* Single bytes need no swap on either endianness.  */
	  emit_insn (gen_zero_extendqidi2 (reg, mem));
	  break;
	case E_HImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		/* Byte-swap into a temporary before widening.  */
		src = gen_reg_rtx (HImode);
		emit_insn (gen_bswaphi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendhidi2 (reg, src));
	    break;
	  }
	case E_SImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (SImode);
		emit_insn (gen_bswapsi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendsidi2 (reg, src));
	  }
	  break;
	case E_DImode:
	  /* Full-width: swap directly into REG, or plain move on BE.  */
	  if (!BYTES_BIG_ENDIAN)
	    emit_insn (gen_bswapdi2 (reg, mem));
	  else
	    emit_insn (gen_movdi (reg, mem));
	  break;
	default:
	  gcc_unreachable ();
	}
      break;

    case E_SImode:
      switch (mode)
	{
	case E_QImode:
	  emit_insn (gen_zero_extendqisi2 (reg, mem));
	  break;
	case E_HImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (HImode);
		emit_insn (gen_bswaphi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendhisi2 (reg, src));
	    break;
	  }
	case E_SImode:
	  if (!BYTES_BIG_ENDIAN)
	    emit_insn (gen_bswapsi2 (reg, mem));
	  else
	    emit_insn (gen_movsi (reg, mem));
	  break;
	case E_DImode:
	  /* DImode is larger than the destination reg so is not expected.  */
	  gcc_unreachable ();
	  break;
	default:
	  gcc_unreachable ();
	}
      break;

    case E_QImode:
      /* Byte-to-byte move: no extension and no swap needed.  */
      gcc_assert (mode == E_QImode);
      emit_move_insn (reg, mem);
      break;

    default:
      gcc_unreachable ();
      break;
    }
}
269 | ||
270 | /* Select the mode to be used for reading the next chunk of bytes | |
271 | in the compare. | |
272 | ||
273 | OFFSET is the current read offset from the beginning of the block. | |
274 | BYTES is the number of bytes remaining to be read. | |
74f9986e | 275 | ALIGN is the minimum alignment of the memory blocks being compared in bytes. */ |
8845cb37 AS |
276 | static machine_mode |
277 | select_block_compare_mode (unsigned HOST_WIDE_INT offset, | |
278 | unsigned HOST_WIDE_INT bytes, | |
74f9986e | 279 | unsigned HOST_WIDE_INT align) |
8845cb37 AS |
280 | { |
281 | /* First see if we can do a whole load unit | |
282 | as that will be more efficient than a larger load + shift. */ | |
283 | ||
284 | /* If big, use biggest chunk. | |
285 | If exactly chunk size, use that size. | |
286 | If remainder can be done in one piece with shifting, do that. | |
287 | Do largest chunk possible without violating alignment rules. */ | |
288 | ||
289 | /* The most we can read without potential page crossing. */ | |
290 | unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align); | |
291 | ||
74f9986e AS |
292 | /* If we have an LE target without ldbrx and word_mode is DImode, |
293 | then we must avoid using word_mode. */ | |
294 | int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX | |
295 | && word_mode == DImode); | |
296 | ||
8845cb37 AS |
297 | if (word_mode_ok && bytes >= UNITS_PER_WORD) |
298 | return word_mode; | |
299 | else if (bytes == GET_MODE_SIZE (SImode)) | |
300 | return SImode; | |
301 | else if (bytes == GET_MODE_SIZE (HImode)) | |
302 | return HImode; | |
303 | else if (bytes == GET_MODE_SIZE (QImode)) | |
304 | return QImode; | |
305 | else if (bytes < GET_MODE_SIZE (SImode) | |
f7e94dfb | 306 | && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED |
8845cb37 AS |
307 | && offset >= GET_MODE_SIZE (SImode) - bytes) |
308 | /* This matches the case were we have SImode and 3 bytes | |
309 | and offset >= 1 and permits us to move back one and overlap | |
310 | with the previous read, thus avoiding having to shift | |
311 | unwanted bytes off of the input. */ | |
312 | return SImode; | |
313 | else if (word_mode_ok && bytes < UNITS_PER_WORD | |
f7e94dfb | 314 | && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED |
8845cb37 AS |
315 | && offset >= UNITS_PER_WORD-bytes) |
316 | /* Similarly, if we can use DImode it will get matched here and | |
317 | can do an overlapping read that ends at the end of the block. */ | |
318 | return word_mode; | |
319 | else if (word_mode_ok && maxread >= UNITS_PER_WORD) | |
320 | /* It is safe to do all remaining in one load of largest size, | |
321 | possibly with a shift to get rid of unwanted bytes. */ | |
322 | return word_mode; | |
323 | else if (maxread >= GET_MODE_SIZE (SImode)) | |
324 | /* It is safe to do all remaining in one SImode load, | |
325 | possibly with a shift to get rid of unwanted bytes. */ | |
326 | return SImode; | |
327 | else if (bytes > GET_MODE_SIZE (SImode)) | |
328 | return SImode; | |
329 | else if (bytes > GET_MODE_SIZE (HImode)) | |
330 | return HImode; | |
331 | ||
332 | /* final fallback is do one byte */ | |
333 | return QImode; | |
334 | } | |
335 | ||
336 | /* Compute the alignment of pointer+OFFSET where the original alignment | |
337 | of pointer was BASE_ALIGN. */ | |
338 | static unsigned HOST_WIDE_INT | |
339 | compute_current_alignment (unsigned HOST_WIDE_INT base_align, | |
340 | unsigned HOST_WIDE_INT offset) | |
341 | { | |
342 | if (offset == 0) | |
343 | return base_align; | |
344 | return MIN (base_align, offset & -offset); | |
345 | } | |
346 | ||
5ec3397e AS |
347 | /* Prepare address and then do a load. |
348 | ||
349 | MODE is the mode to use for the load. | |
350 | DEST is the destination register for the data. | |
351 | ADDR is the address to be loaded. | |
352 | ORIG_ADDR is the original address expression. */ | |
353 | static void | |
354 | do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr, | |
355 | rtx orig_addr) | |
356 | { | |
357 | rtx mem = gen_rtx_MEM (mode, addr); | |
358 | MEM_COPY_ATTRIBUTES (mem, orig_addr); | |
359 | set_mem_size (mem, GET_MODE_SIZE (mode)); | |
360 | do_load_for_compare (dest, mem, mode); | |
361 | return; | |
362 | } | |
363 | ||
364 | /* Do a branch for an if/else decision. | |
365 | ||
366 | CMPMODE is the mode to use for the comparison. | |
367 | COMPARISON is the rtx code for the compare needed. | |
368 | A is the first thing to be compared. | |
369 | B is the second thing to be compared. | |
370 | CR is the condition code reg input, or NULL_RTX. | |
371 | TRUE_LABEL is the label to branch to if the condition is true. | |
372 | ||
373 | The return value is the CR used for the comparison. | |
374 | If CR is null_rtx, then a new register of CMPMODE is generated. | |
375 | If A and B are both null_rtx, then CR must not be null, and the | |
376 | compare is not generated so you can use this with a dot form insn. */ | |
377 | ||
378 | static void | |
379 | do_ifelse (machine_mode cmpmode, rtx_code comparison, | |
380 | rtx a, rtx b, rtx cr, rtx true_label) | |
381 | { | |
382 | gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX) | |
383 | || (a != NULL_RTX && b != NULL_RTX)); | |
384 | ||
385 | if (cr != NULL_RTX) | |
386 | gcc_assert (GET_MODE (cr) == cmpmode); | |
387 | else | |
388 | cr = gen_reg_rtx (cmpmode); | |
389 | ||
390 | rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label); | |
391 | ||
392 | if (a != NULL_RTX) | |
393 | emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b)); | |
394 | ||
395 | rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx); | |
396 | ||
397 | rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx); | |
398 | rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); | |
399 | JUMP_LABEL (j) = true_label; | |
400 | LABEL_NUSES (true_label) += 1; | |
401 | } | |
402 | ||
403 | /* Emit an isel of the proper mode for DEST. | |
404 | ||
405 | DEST is the isel destination register. | |
406 | SRC1 is the isel source if CR is true. | |
407 | SRC2 is the isel source if CR is false. | |
408 | CR is the condition for the isel. */ | |
409 | static void | |
410 | do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr) | |
411 | { | |
412 | if (GET_MODE (dest) == DImode) | |
413 | emit_insn (gen_isel_signed_di (dest, cmp, src_t, src_f, cr)); | |
414 | else | |
415 | emit_insn (gen_isel_signed_si (dest, cmp, src_t, src_f, cr)); | |
416 | } | |
417 | ||
418 | /* Emit a subtract of the proper mode for DEST. | |
419 | ||
420 | DEST is the destination register for the subtract. | |
421 | SRC1 is the first subtract input. | |
422 | SRC2 is the second subtract input. | |
423 | ||
424 | Computes DEST = SRC1-SRC2. */ | |
425 | static void | |
426 | do_sub3 (rtx dest, rtx src1, rtx src2) | |
427 | { | |
428 | if (GET_MODE (dest) == DImode) | |
429 | emit_insn (gen_subdi3 (dest, src1, src2)); | |
430 | else | |
431 | emit_insn (gen_subsi3 (dest, src1, src2)); | |
432 | } | |
433 | ||
434 | /* Emit an add of the proper mode for DEST. | |
435 | ||
436 | DEST is the destination register for the add. | |
437 | SRC1 is the first add input. | |
438 | SRC2 is the second add input. | |
439 | ||
440 | Computes DEST = SRC1+SRC2. */ | |
441 | static void | |
442 | do_add3 (rtx dest, rtx src1, rtx src2) | |
443 | { | |
444 | if (GET_MODE (dest) == DImode) | |
445 | emit_insn (gen_adddi3 (dest, src1, src2)); | |
446 | else | |
447 | emit_insn (gen_addsi3 (dest, src1, src2)); | |
448 | } | |
449 | ||
f7e94dfb AS |
450 | /* Emit an and of the proper mode for DEST. |
451 | ||
452 | DEST is the destination register for the and. | |
453 | SRC1 is the first and input. | |
454 | SRC2 is the second and input. | |
455 | ||
456 | Computes DEST = SRC1&SRC2. */ | |
457 | static void | |
458 | do_and3 (rtx dest, rtx src1, rtx src2) | |
459 | { | |
460 | if (GET_MODE (dest) == DImode) | |
461 | emit_insn (gen_anddi3 (dest, src1, src2)); | |
462 | else | |
463 | emit_insn (gen_andsi3 (dest, src1, src2)); | |
464 | } | |
465 | ||
466 | /* Emit an cmpb of the proper mode for DEST. | |
467 | ||
468 | DEST is the destination register for the cmpb. | |
469 | SRC1 is the first input. | |
470 | SRC2 is the second input. | |
471 | ||
472 | Computes cmpb of SRC1, SRC2. */ | |
473 | static void | |
474 | do_cmpb3 (rtx dest, rtx src1, rtx src2) | |
475 | { | |
476 | if (GET_MODE (dest) == DImode) | |
477 | emit_insn (gen_cmpbdi3 (dest, src1, src2)); | |
478 | else | |
479 | emit_insn (gen_cmpbsi3 (dest, src1, src2)); | |
480 | } | |
481 | ||
482 | /* Emit a rotl of the proper mode for DEST. | |
483 | ||
484 | DEST is the destination register for the and. | |
485 | SRC1 is the first and input. | |
486 | SRC2 is the second and input. | |
487 | ||
488 | Computes DEST = SRC1 rotated left by SRC2. */ | |
489 | static void | |
490 | do_rotl3 (rtx dest, rtx src1, rtx src2) | |
491 | { | |
492 | if (GET_MODE (dest) == DImode) | |
493 | emit_insn (gen_rotldi3 (dest, src1, src2)); | |
494 | else | |
495 | emit_insn (gen_rotlsi3 (dest, src1, src2)); | |
496 | } | |
497 | ||
/* Generate rtl for a load, shift, and compare of less than a full word.

   LOAD_MODE is the machine mode for the loads.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem, rtx dcond,
		      rtx src1_addr, rtx src2_addr, rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  rtx shift_amount = gen_reg_rtx (word_mode);
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  /* Load a full LOAD_MODE chunk from each source, then shift off the
     bytes past CMP_REM so only the wanted bytes take part in the
     compare.  */
  do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2);
  /* shift_amount = load_mode_size - cmp_rem, in bytes for now.  */
  do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem);

  if (word_mode == DImode)
    {
      /* Convert the byte count into a bit count, then shift the
	 unwanted low-order bytes out of both loaded values.  */
      emit_insn (gen_ashldi3 (shift_amount, shift_amount,
			      GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrdi3 (d1, d1,
			      gen_lowpart (SImode, shift_amount)));
      emit_insn (gen_lshrdi3 (d2, d2,
			      gen_lowpart (SImode, shift_amount)));
    }
  else
    {
      emit_insn (gen_ashlsi3 (shift_amount, shift_amount,
			      GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrsi3 (d1, d1, shift_amount));
      emit_insn (gen_lshrsi3 (d2, d2, shift_amount));
    }

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      /* Pre-P9: compute the difference with a carry-setting subtract
	 so later code can derive the comparison result.  */
      if (word_mode == DImode)
	emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
	emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}
552 | ||
/* Generate rtl for an overlapping load and compare of less than a
   full load_mode.  This assumes that the previous word is part of the
   block being compared so it's ok to back up part of a word so we can
   compare the last unaligned full word that ends at the end of the block.

   LOAD_MODE is the machine mode for the loads.
   ISCONST tells whether the remaining length is a constant or in a register.
   BYTES_REM is the remaining length if ISCONST is true.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare if !ISCONST.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_overlap_load_compare (machine_mode load_mode, bool isConst,
			HOST_WIDE_INT bytes_rem, rtx diff,
			rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr,
			rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  /* How far to back the addresses up so a full LOAD_MODE read ends
     exactly at the end of the block (constant case).  */
  HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem;
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  rtx addr1, addr2;
  if (!isConst || addr_adj)
    {
      rtx adj_reg = gen_reg_rtx (word_mode);
      if (isConst)
	emit_move_insn (adj_reg, GEN_INT (-addr_adj));
      else
	{
	  /* Runtime length: adj = cmp_rem - load_mode_size, which is
	     the (negative) backup distance.  */
	  rtx reg_lms = gen_reg_rtx (word_mode);
	  emit_move_insn (reg_lms, GEN_INT (load_mode_size));
	  do_sub3 (adj_reg, cmp_rem, reg_lms);
	}

      addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg);
      addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg);
    }
  else
    {
      /* Constant remainder equal to a full load: no adjustment.  */
      addr1 = src1_addr;
      addr2 = src2_addr;
    }

  do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      /* Pre-P9: carry-setting subtract for later result extraction.  */
      if (word_mode == DImode)
	emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
	emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}
618 | ||
37ae4739 AS |
619 | /* Generate the sequence of compares for strcmp/strncmp using vec/vsx |
620 | instructions. | |
621 | ||
622 | BYTES_TO_COMPARE is the number of bytes to be compared. | |
623 | ORIG_SRC1 is the unmodified rtx for the first string. | |
624 | ORIG_SRC2 is the unmodified rtx for the second string. | |
625 | S1ADDR is the register to use for the base address of the first string. | |
626 | S2ADDR is the register to use for the base address of the second string. | |
627 | OFF_REG is the register to use for the string offset for loads. | |
628 | S1DATA is the register for loading the first string. | |
629 | S2DATA is the register for loading the second string. | |
630 | VEC_RESULT is the rtx for the vector result indicating the byte difference. | |
631 | EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call | |
632 | to strcmp/strncmp if we have equality at the end of the inline comparison. | |
633 | P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code | |
634 | to clean up and generate the final comparison result. | |
635 | FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just | |
636 | set the final result. | |
637 | CHECKZERO indicates whether the sequence should check for zero bytes | |
638 | for use doing strncmp, or not (for use doing memcmp). */ | |
639 | static void | |
640 | expand_cmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare, | |
641 | rtx orig_src1, rtx orig_src2, | |
642 | rtx s1addr, rtx s2addr, rtx off_reg, | |
643 | rtx s1data, rtx s2data, rtx vec_result, | |
644 | bool equality_compare_rest, rtx *p_cleanup_label, | |
645 | rtx final_move_label, bool checkzero) | |
646 | { | |
647 | machine_mode load_mode; | |
648 | unsigned int load_mode_size; | |
649 | unsigned HOST_WIDE_INT cmp_bytes = 0; | |
650 | unsigned HOST_WIDE_INT offset = 0; | |
651 | rtx zero_reg = NULL; | |
652 | ||
653 | gcc_assert (p_cleanup_label != NULL); | |
654 | rtx cleanup_label = *p_cleanup_label; | |
655 | ||
656 | emit_move_insn (s1addr, force_reg (Pmode, XEXP (orig_src1, 0))); | |
657 | emit_move_insn (s2addr, force_reg (Pmode, XEXP (orig_src2, 0))); | |
658 | ||
659 | if (checkzero && !TARGET_P9_VECTOR) | |
660 | { | |
661 | zero_reg = gen_reg_rtx (V16QImode); | |
662 | emit_move_insn (zero_reg, CONST0_RTX (V16QImode)); | |
663 | } | |
664 | ||
665 | while (bytes_to_compare > 0) | |
666 | { | |
667 | /* VEC/VSX compare sequence for P8: | |
668 | check each 16B with: | |
669 | lxvd2x 32,28,8 | |
670 | lxvd2x 33,29,8 | |
671 | vcmpequb 2,0,1 # compare strings | |
672 | vcmpequb 4,0,3 # compare w/ 0 | |
673 | xxlorc 37,36,34 # first FF byte is either mismatch or end of string | |
674 | vcmpequb. 7,5,3 # reg 7 contains 0 | |
675 | bnl 6,.Lmismatch | |
676 | ||
677 | For the P8 LE case, we use lxvd2x and compare full 16 bytes | |
678 | but then use use vgbbd and a shift to get two bytes with the | |
679 | information we need in the correct order. | |
680 | ||
681 | VEC/VSX compare sequence if TARGET_P9_VECTOR: | |
682 | lxvb16x/lxvb16x # load 16B of each string | |
683 | vcmpnezb. # produces difference location or zero byte location | |
684 | bne 6,.Lmismatch | |
685 | ||
686 | Use the overlapping compare trick for the last block if it is | |
687 | less than 16 bytes. | |
688 | */ | |
689 | ||
690 | load_mode = V16QImode; | |
691 | load_mode_size = GET_MODE_SIZE (load_mode); | |
692 | ||
693 | if (bytes_to_compare >= load_mode_size) | |
694 | cmp_bytes = load_mode_size; | |
695 | else | |
696 | { | |
697 | /* Move this load back so it doesn't go past the end. P8/P9 | |
698 | can do this efficiently. This is never called with less | |
699 | than 16 bytes so we should always be able to do this. */ | |
700 | unsigned int extra_bytes = load_mode_size - bytes_to_compare; | |
701 | cmp_bytes = bytes_to_compare; | |
702 | gcc_assert (offset > extra_bytes); | |
703 | offset -= extra_bytes; | |
704 | cmp_bytes = load_mode_size; | |
705 | bytes_to_compare = cmp_bytes; | |
706 | } | |
707 | ||
708 | /* The offset currently used is always kept in off_reg so that the | |
709 | cleanup code on P8 can use it to extract the differing byte. */ | |
710 | emit_move_insn (off_reg, GEN_INT (offset)); | |
711 | ||
712 | rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg); | |
713 | do_load_for_compare_from_addr (load_mode, s1data, addr1, orig_src1); | |
714 | rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg); | |
715 | do_load_for_compare_from_addr (load_mode, s2data, addr2, orig_src2); | |
716 | ||
717 | /* Cases to handle. A and B are chunks of the two strings. | |
718 | 1: Not end of comparison: | |
719 | A != B: branch to cleanup code to compute result. | |
720 | A == B: next block | |
721 | 2: End of the inline comparison: | |
722 | A != B: branch to cleanup code to compute result. | |
723 | A == B: call strcmp/strncmp | |
724 | 3: compared requested N bytes: | |
725 | A == B: branch to result 0. | |
726 | A != B: cleanup code to compute result. */ | |
727 | ||
728 | unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes; | |
729 | ||
730 | if (checkzero) | |
731 | { | |
732 | if (TARGET_P9_VECTOR) | |
733 | emit_insn (gen_vcmpnezb_p (vec_result, s1data, s2data)); | |
734 | else | |
735 | { | |
736 | /* Emit instructions to do comparison and zero check. */ | |
737 | rtx cmp_res = gen_reg_rtx (load_mode); | |
738 | rtx cmp_zero = gen_reg_rtx (load_mode); | |
739 | rtx cmp_combined = gen_reg_rtx (load_mode); | |
740 | emit_insn (gen_altivec_eqv16qi (cmp_res, s1data, s2data)); | |
741 | emit_insn (gen_altivec_eqv16qi (cmp_zero, s1data, zero_reg)); | |
742 | emit_insn (gen_orcv16qi3 (vec_result, cmp_zero, cmp_res)); | |
743 | emit_insn (gen_altivec_vcmpequb_p (cmp_combined, vec_result, zero_reg)); | |
744 | } | |
745 | } | |
746 | else | |
747 | emit_insn (gen_altivec_vcmpequb_p (vec_result, s1data, s2data)); | |
748 | ||
749 | bool branch_to_cleanup = (remain > 0 || equality_compare_rest); | |
750 | rtx cr6 = gen_rtx_REG (CCmode, CR6_REGNO); | |
751 | rtx dst_label; | |
752 | rtx cmp_rtx; | |
753 | if (branch_to_cleanup) | |
754 | { | |
755 | /* Branch to cleanup code, otherwise fall through to do more | |
756 | compares. P8 and P9 use different CR bits because on P8 | |
757 | we are looking at the result of a comparsion vs a | |
758 | register of zeroes so the all-true condition means no | |
759 | difference or zero was found. On P9, vcmpnezb sets a byte | |
760 | to 0xff if there is a mismatch or zero, so the all-false | |
761 | condition indicates we found no difference or zero. */ | |
762 | if (!cleanup_label) | |
763 | cleanup_label = gen_label_rtx (); | |
764 | dst_label = cleanup_label; | |
765 | if (TARGET_P9_VECTOR && checkzero) | |
766 | cmp_rtx = gen_rtx_NE (VOIDmode, cr6, const0_rtx); | |
767 | else | |
768 | cmp_rtx = gen_rtx_GE (VOIDmode, cr6, const0_rtx); | |
769 | } | |
770 | else | |
771 | { | |
772 | /* Branch to final return or fall through to cleanup, | |
773 | result is already set to 0. */ | |
774 | dst_label = final_move_label; | |
775 | if (TARGET_P9_VECTOR && checkzero) | |
776 | cmp_rtx = gen_rtx_EQ (VOIDmode, cr6, const0_rtx); | |
777 | else | |
778 | cmp_rtx = gen_rtx_LT (VOIDmode, cr6, const0_rtx); | |
779 | } | |
780 | ||
781 | rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label); | |
782 | rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, | |
783 | lab_ref, pc_rtx); | |
784 | rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); | |
785 | JUMP_LABEL (j2) = dst_label; | |
786 | LABEL_NUSES (dst_label) += 1; | |
787 | ||
788 | offset += cmp_bytes; | |
789 | bytes_to_compare -= cmp_bytes; | |
790 | } | |
791 | *p_cleanup_label = cleanup_label; | |
792 | return; | |
793 | } | |
794 | ||
795 | /* Generate the final sequence that identifies the differing | |
796 | byte and generates the final result, taking into account | |
797 | zero bytes: | |
798 | ||
799 | P8: | |
800 | vgbbd 0,0 | |
801 | vsldoi 0,0,0,9 | |
802 | mfvsrd 9,32 | |
803 | addi 10,9,-1 # count trailing zero bits | |
804 | andc 9,10,9 | |
805 | popcntd 9,9 | |
806 | lbzx 10,28,9 # use that offset to load differing byte | |
807 | lbzx 3,29,9 | |
808 | subf 3,3,10 # subtract for final result | |
809 | ||
810 | P9: | |
811 | vclzlsbb # counts trailing bytes with lsb=0 | |
812 | vextublx # extract differing byte | |
813 | ||
814 | STR1 is the reg rtx for data from string 1. | |
815 | STR2 is the reg rtx for data from string 2. | |
816 | RESULT is the reg rtx for the comparison result. | |
817 | S1ADDR is the register to use for the base address of the first string. | |
818 | S2ADDR is the register to use for the base address of the second string. | |
819 | ORIG_SRC1 is the unmodified rtx for the first string. | |
820 | ORIG_SRC2 is the unmodified rtx for the second string. | |
821 | OFF_REG is the register to use for the string offset for loads. | |
822 | VEC_RESULT is the rtx for the vector result indicating the byte difference. */ | |
823 | ||
824 | static void | |
825 | emit_final_compare_vec (rtx str1, rtx str2, rtx result, | |
826 | rtx s1addr, rtx s2addr, | |
827 | rtx orig_src1, rtx orig_src2, | |
828 | rtx off_reg, rtx vec_result) | |
829 | { | |
830 | ||
831 | if (TARGET_P9_VECTOR) | |
832 | { | |
833 | rtx diffix = gen_reg_rtx (SImode); | |
834 | rtx chr1 = gen_reg_rtx (SImode); | |
835 | rtx chr2 = gen_reg_rtx (SImode); | |
836 | rtx chr1_di = simplify_gen_subreg (DImode, chr1, SImode, 0); | |
837 | rtx chr2_di = simplify_gen_subreg (DImode, chr2, SImode, 0); | |
838 | emit_insn (gen_vclzlsbb_v16qi (diffix, vec_result)); | |
839 | emit_insn (gen_vextublx (chr1, diffix, str1)); | |
840 | emit_insn (gen_vextublx (chr2, diffix, str2)); | |
841 | do_sub3 (result, chr1_di, chr2_di); | |
842 | } | |
843 | else | |
844 | { | |
845 | gcc_assert (TARGET_P8_VECTOR); | |
846 | rtx diffix = gen_reg_rtx (DImode); | |
847 | rtx result_gbbd = gen_reg_rtx (V16QImode); | |
848 | /* Since each byte of the input is either 00 or FF, the bytes in | |
849 | dw0 and dw1 after vgbbd are all identical to each other. */ | |
850 | emit_insn (gen_p8v_vgbbd (result_gbbd, vec_result)); | |
851 | /* For LE, we shift by 9 and get BA in the low two bytes then CTZ. | |
852 | For BE, we shift by 7 and get AB in the high two bytes then CLZ. */ | |
853 | rtx result_shifted = gen_reg_rtx (V16QImode); | |
854 | int shift_amt = (BYTES_BIG_ENDIAN) ? 7 : 9; | |
855 | emit_insn (gen_altivec_vsldoi_v16qi (result_shifted, result_gbbd, | |
856 | result_gbbd, GEN_INT (shift_amt))); | |
857 | ||
858 | rtx diffix_df = simplify_gen_subreg (DFmode, diffix, DImode, 0); | |
859 | emit_insn (gen_p8_mfvsrd_3_v16qi (diffix_df, result_shifted)); | |
860 | rtx count = gen_reg_rtx (DImode); | |
861 | ||
862 | if (BYTES_BIG_ENDIAN) | |
863 | emit_insn (gen_clzdi2 (count, diffix)); | |
864 | else | |
865 | emit_insn (gen_ctzdi2 (count, diffix)); | |
866 | ||
867 | /* P8 doesn't have a good solution for extracting one byte from | |
868 | a vsx reg like vextublx on P9 so we just compute the offset | |
869 | of the differing byte and load it from each string. */ | |
870 | do_add3 (off_reg, off_reg, count); | |
871 | ||
872 | rtx chr1 = gen_reg_rtx (QImode); | |
873 | rtx chr2 = gen_reg_rtx (QImode); | |
874 | rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg); | |
875 | do_load_for_compare_from_addr (QImode, chr1, addr1, orig_src1); | |
876 | rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg); | |
877 | do_load_for_compare_from_addr (QImode, chr2, addr2, orig_src2); | |
878 | machine_mode rmode = GET_MODE (result); | |
879 | rtx chr1_rm = simplify_gen_subreg (rmode, chr1, QImode, 0); | |
880 | rtx chr2_rm = simplify_gen_subreg (rmode, chr2, QImode, 0); | |
881 | do_sub3 (result, chr1_rm, chr2_rm); | |
882 | } | |
883 | ||
884 | return; | |
885 | } | |
886 | ||
5ec3397e AS |
/* Expand a block compare operation using loop code, and return true
   if successful.  Return false if we should let the compiler generate
   normal code, probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_compare_loop (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  /* Allow non-const length; the cleanup and library-call paths below
     handle a runtime length.  */
  int bytes_is_const = CONST_INT_P (bytes_rtx);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
  HOST_WIDE_INT minalign = MIN (align1, align2);

  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to move?  */
  HOST_WIDE_INT bytes = 0;
  if (bytes_is_const)
    bytes = INTVAL (bytes_rtx);

  if (bytes_is_const && bytes == 0)
    return true;

  /* Limit the amount we compare, if known statically.  The limits are
     per-processor heuristics; 0 means "don't use the inline loop".  */
  HOST_WIDE_INT max_bytes;
  switch (rs6000_tune)
    {
    case PROCESSOR_POWER7:
      if (!bytes_is_const)
	if (minalign < 8)
	  max_bytes = 0;
	else
	  max_bytes = 128;
      else
	if (minalign < 8)
	  max_bytes = 32;
	else
	  max_bytes = 128;
      break;
    case PROCESSOR_POWER8:
      if (!bytes_is_const)
	max_bytes = 0;
      else
	if (minalign < 8)
	  max_bytes = 128;
	else
	  max_bytes = 64;
      break;
    case PROCESSOR_POWER9:
      if (bytes_is_const)
	max_bytes = 191;
      else
	max_bytes = 0;
      break;
    default:
      max_bytes = 128;
    }

  /* Allow the option to override the default.  */
  if (rs6000_block_compare_inline_loop_limit >= 0)
    max_bytes = (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_loop_limit;

  if (max_bytes == 0)
    return false;

  rtx cmp_rem = gen_reg_rtx (word_mode);  /* Remainder for library call.  */
  rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop.  */
  HOST_WIDE_INT niter;
  rtx iter = gen_reg_rtx (word_mode);
  rtx iv1 = gen_reg_rtx (word_mode);
  rtx iv2 = gen_reg_rtx (word_mode);
  rtx d1_1 = gen_reg_rtx (word_mode);  /* Data loaded from src1+iv1.  */
  rtx d1_2 = gen_reg_rtx (word_mode);  /* Data loaded from src1+iv2.  */
  rtx d2_1 = gen_reg_rtx (word_mode);  /* Data loaded from src2+iv1.  */
  rtx d2_2 = gen_reg_rtx (word_mode);  /* Data loaded from src2+iv2.  */

  /* Strip unneeded subreg from length if there is one.  */
  if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx))
    bytes_rtx = SUBREG_REG (bytes_rtx);
  /* Extend bytes_rtx to word_mode if needed.  But, we expect only to
     maybe have to deal with the case where bytes_rtx is SImode and
     word_mode is DImode.  */
  if (!bytes_is_const)
    {
      if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
	/* Do not expect length longer than word_mode.  */
	return false;
      else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode))
	{
	  bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
	  bytes_rtx = force_reg (word_mode,
				 gen_rtx_fmt_e (ZERO_EXTEND, word_mode,
						bytes_rtx));
	}
      else
	/* Make sure it's in a register before we get started.  */
	bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
    }

  machine_mode load_mode = word_mode;
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);

  /* Number of bytes per iteration of the unrolled loop.  */
  HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
  /* Max iters and bytes compared in the loop.  */
  HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes;
  HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes;
  int l2lb = floor_log2 (loop_bytes);

  if (bytes_is_const && (max_bytes < load_mode_size
			 || !IN_RANGE (bytes, load_mode_size, max_bytes)))
    return false;

  bool no_remainder_code = false;
  rtx final_label = gen_label_rtx ();
  rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
  rtx diff_label = gen_label_rtx ();
  rtx library_call_label = NULL;
  rtx cleanup_label = gen_label_rtx ();

  rtx cr;

  rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
  rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));

  /* Difference found is stored here before jump to diff_label.  */
  rtx diff = gen_reg_rtx (word_mode);
  rtx j;

  /* Example of generated code for 35 bytes aligned 1 byte.

	     mtctr 8
	     li 6,0
	     li 5,8
     .L13:
	     ldbrx 7,3,6
	     ldbrx 9,10,6
	     ldbrx 0,3,5
	     ldbrx 4,10,5
	     addi 6,6,16
	     addi 5,5,16
	     subfc. 9,9,7
	     bne 0,.L10
	     subfc. 9,4,0
	     bdnzt 2,.L13
	     bne 0,.L10
	     add 3,3,6
	     add 10,10,6
	     addi 9,3,-5
	     ldbrx 7,0,9
	     addi 9,10,-5
	     ldbrx 9,0,9
	     subfc 9,9,7
	     .p2align 4,,15
     .L10:
	     popcntd 9,9
	     subfe 10,10,10
	     or 9,9,10

     Compiled with -fno-reorder-blocks for clarity.  */

  /* Structure of what we're going to do:
     Two separate lengths: what we will compare before bailing to library
	call (max_bytes), and the total length to be checked.
     if length <= 16, branch to linear cleanup code starting with
	remainder length check (length not known at compile time)
     set up 2 iv's and load count reg, compute remainder length
     unrollx2 compare loop
     if loop exit due to a difference, branch to difference handling code
     if remainder length < 8, branch to final cleanup compare
     load and compare 8B
     final cleanup comparison (depends on alignment and length)
	load 8B, shift off bytes past length, compare
	load 8B ending at last byte and compare
	load/compare 1 byte at a time (short block abutting 4k boundary)
     difference handling, 64->32 conversion
     final result
     branch around memcmp call
     memcmp library call
  */

  /* If bytes is not const, compare length and branch directly
     to the cleanup code that can handle 0-16 bytes if length
     is >= 16.  Stash away bytes-max_bytes for the library call.  */
  if (bytes_is_const)
    {
      /* These need to be set for some of the places we may jump to.  */
      if (bytes > max_bytes)
	{
	  no_remainder_code = true;
	  niter = max_loop_iter;
	  library_call_label = gen_label_rtx ();
	}
      else
	{
	  niter = bytes / loop_bytes;
	}
      emit_move_insn (iter, GEN_INT (niter));
      emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes));
      emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes));
    }
  else
    {
      library_call_label = gen_label_rtx ();

      /* If we go to the cleanup code, it expects length to be in cmp_rem.  */
      emit_move_insn (cmp_rem, bytes_rtx);

      /* Check for > max_bytes bytes.  We want to bail out as quickly as
	 possible if we have to go over to memcmp.  */
      do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
		 NULL_RTX, library_call_label);

      /* Check for < loop_bytes bytes.  */
      do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes),
		 NULL_RTX, cleanup_label);

      /* Loop compare bytes and iterations if bytes>max_bytes.  */
      rtx mb_reg = gen_reg_rtx (word_mode);
      emit_move_insn (mb_reg, GEN_INT (max_loop_bytes));
      rtx mi_reg = gen_reg_rtx (word_mode);
      emit_move_insn (mi_reg, GEN_INT (max_loop_iter));

      /* Compute number of loop iterations if bytes <= max_bytes.  */
      if (word_mode == DImode)
	emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb)));
      else
	emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb)));

      /* Compute bytes to compare in loop if bytes <= max_bytes.  */
      rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb);
      if (word_mode == DImode)
	{
	  emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask));
	}
      else
	{
	  emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask));
	}

      /* Check for bytes <= max_bytes.  */
      if (TARGET_ISEL)
	{
	  /* P9 has fast isel so we use one compare and two isel.  */
	  cr = gen_reg_rtx (CCmode);
	  rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx,
					     GEN_INT (max_bytes));
	  emit_move_insn (cr, compare_rtx);
	  rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx);
	  do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr);
	  do_isel (iter, cmp_rtx, iter, mi_reg, cr);
	}
      else
	{
	  rtx lab_after = gen_label_rtx ();
	  do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes),
		     NULL_RTX, lab_after);
	  emit_move_insn (loop_cmp, mb_reg);
	  emit_move_insn (iter, mi_reg);
	  emit_label (lab_after);
	}

      /* Now compute remainder bytes which isn't used until after the loop.  */
      do_sub3 (cmp_rem, bytes_rtx, loop_cmp);
    }

  rtx dcond = NULL_RTX; /* CC holding the compare result when we jump to
			   diff_label.  */
  /* For p9 we need to have just one of these as multiple places define
     it and it gets used by the setb at the end.  */
  if (TARGET_P9_MISC)
    dcond = gen_reg_rtx (CCUNSmode);

  if (!bytes_is_const || bytes >= loop_bytes)
    {
      /* It should not be possible to come here if remaining bytes is
	 < 16 in the runtime case either.  Compute number of loop
	 iterations.  We compare 2*word_mode per iteration so 16B for
	 64-bit code and 8B for 32-bit.  Set up two induction
	 variables and load count register.  */

      /* HACK ALERT: create hard reg for CTR here.  If we just use a
	 pseudo, cse will get rid of it and then the allocator will
	 see it used in the lshr above and won't give us ctr.  */
      rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
      emit_move_insn (ctr, iter);
      emit_move_insn (diff, GEN_INT (0));
      emit_move_insn (iv1, GEN_INT (0));
      emit_move_insn (iv2, GEN_INT (load_mode_size));

      /* Inner loop to compare 2*word_mode per iteration.  */
      rtx loop_top_label = gen_label_rtx ();
      emit_label (loop_top_label);

      rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1);
      rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1);

      do_load_for_compare_from_addr (load_mode, d1_1,
				     src1_ix1, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_1,
				     src2_ix1, orig_src2);
      do_add3 (iv1, iv1, GEN_INT (loop_bytes));

      rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2);
      rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2);

      do_load_for_compare_from_addr (load_mode, d1_2,
				     src1_ix2, orig_src1);
      do_load_for_compare_from_addr (load_mode, d2_2,
				     src2_ix2, orig_src2);
      do_add3 (iv2, iv2, GEN_INT (loop_bytes));

      if (TARGET_P9_MISC)
	{
	  /* Generate a compare, and convert with a setb later.  */
	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
	  emit_insn (gen_rtx_SET (dcond, cmp));
	}
      else
	{
	  dcond = gen_reg_rtx (CCmode);
	  if (word_mode == DImode)
	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	  else
	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	}

      do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
		 dcond, diff_label);

      if (TARGET_P9_MISC)
	{
	  /* Generate a compare, and convert with a setb later.  */
	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2);
	  emit_insn (gen_rtx_SET (dcond, cmp));
	}
      else
	{
	  dcond = gen_reg_rtx (CCmode);
	  if (word_mode == DImode)
	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond));
	  else
	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond));
	}

      /* bdnztf: decrement CTR and branch to loop top if CTR is nonzero
	 AND the second-word compare found no difference.  */
      rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2);
      if (TARGET_64BIT)
	j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr,
					   eqrtx, dcond));
      else
	j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
					   eqrtx, dcond));
      JUMP_LABEL (j) = loop_top_label;
      LABEL_NUSES (loop_top_label) += 1;
    }

  HOST_WIDE_INT bytes_remaining = 0;
  if (bytes_is_const)
    bytes_remaining = (bytes % loop_bytes);

  /* If diff is nonzero, branch to difference handling
     code.  If we exit here with a nonzero diff, it is
     because the second word differed.  */
  if (TARGET_P9_MISC)
    do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond, diff_label);
  else
    do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX, diff_label);

  if (library_call_label != NULL && bytes_is_const && bytes > max_bytes)
    {
      /* If the length is known at compile time, then we will always
	 have a remainder to go to the library call with.  */
      rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, library_call_label);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref));
      JUMP_LABEL (j) = library_call_label;
      LABEL_NUSES (library_call_label) += 1;
      emit_barrier ();
    }

  if (bytes_is_const && bytes_remaining == 0)
    {
      /* No remainder and if we are here then diff is 0 so just return 0.  */
      if (TARGET_64BIT)
	emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
      else
	emit_move_insn (target, diff);
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
      JUMP_LABEL (j) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }
  else if (!no_remainder_code)
    {
      /* Update addresses to point to the next word to examine.  */
      do_add3 (src1_addr, src1_addr, iv1);
      do_add3 (src2_addr, src2_addr, iv1);

      emit_label (cleanup_label);

      if (!bytes_is_const)
	{
	  /* If we're dealing with runtime length, we have to check if
	     it's zero after the loop.  When length is known at compile
	     time the no-remainder condition is dealt with above.  By
	     doing this after cleanup_label, we also deal with the
	     case where length is 0 at the start and we bypass the
	     loop with a branch to cleanup_label.  */
	  emit_move_insn (target, const0_rtx);
	  do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
		     NULL_RTX, final_label);
	}

      rtx final_cleanup = gen_label_rtx ();
      rtx cmp_rem_before = gen_reg_rtx (word_mode);
      /* Compare one more word_mode chunk if needed.  */
      if (!bytes_is_const || bytes_remaining >= load_mode_size)
	{
	  /* If remainder length < word length, branch to final
	     cleanup compare.  */
	  if (!bytes_is_const)
	    do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size),
		       NULL_RTX, final_cleanup);

	  /* Load and compare 8B (one word_mode chunk).  */
	  do_load_for_compare_from_addr (load_mode, d1_1,
					 src1_addr, orig_src1);
	  do_load_for_compare_from_addr (load_mode, d2_1,
					 src2_addr, orig_src2);

	  /* Compare the word, see if we need to do the last partial.  */
	  if (TARGET_P9_MISC)
	    {
	      /* Generate a compare, and convert with a setb later.  */
	      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
	      emit_insn (gen_rtx_SET (dcond, cmp));
	    }
	  else
	    {
	      dcond = gen_reg_rtx (CCmode);
	      if (word_mode == DImode)
		emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	      else
		emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
	    }

	  do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
		     dcond, diff_label);

	  do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size));
	  do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size));
	  emit_move_insn (cmp_rem_before, cmp_rem);
	  do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size));
	  if (bytes_is_const)
	    bytes_remaining -= load_mode_size;
	  else
	    /* See if remaining length is now zero.  We previously set
	       target to 0 so we can just jump to the end.  */
	    do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
		       NULL_RTX, final_label);

	}

      /* Cases:
	 bytes_is_const
	   We can always shift back to do an overlapping compare
	   of the last chunk because we know length >= 8.

	 !bytes_is_const
	   align>=load_mode_size
	     Read word_mode and mask
	   align<load_mode_size
	     avoid stepping past end

	  Three strategies:
	  * decrement address and do overlapping compare
	  * read word_mode and mask
	  * carefully avoid crossing 4k boundary
       */

      if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7))
	  && align1 >= load_mode_size && align2 >= load_mode_size)
	{
	  /* Alignment is larger than word_mode so we do not need to be
	     concerned with extra page crossings.  But, we do not know
	     that the length is larger than load_mode_size so we might
	     end up comparing against data before the block if we try
	     an overlapping compare.  Also we use this on P7 for fixed length
	     remainder because P7 doesn't like overlapping unaligned.
	     Strategy: load 8B, shift off bytes past length, and compare.  */
	  emit_label (final_cleanup);
	  do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
				src1_addr, src2_addr, orig_src1, orig_src2);
	}
      else if (bytes_remaining && bytes_is_const)
	{
	  /* We do not do loop expand if length < 32 so we know at the
	     end we can do an overlapping compare.
	     Strategy: shift address back and do word_mode load that
	     ends at the end of the block.  */
	  emit_label (final_cleanup);
	  do_overlap_load_compare (load_mode, true, bytes_remaining, diff,
				   cmp_rem, dcond, src1_addr, src2_addr,
				   orig_src1, orig_src2);
	}
      else if (!bytes_is_const)
	{
	  rtx handle4k_label = gen_label_rtx ();
	  rtx nonconst_overlap = gen_label_rtx ();
	  emit_label (nonconst_overlap);

	  /* Here we have to handle the case where we have runtime
	     length which may be too short for overlap compare, and
	     alignment is not at least load_mode_size so we have to
	     tread carefully to avoid stepping across 4k boundaries.  */

	  /* If the length after the loop was larger than word_mode
	     size, we can just do an overlapping compare and we're
	     done.  We fall through to this code from the word_mode
	     compare that precedes this.  */
	  do_overlap_load_compare (load_mode, false, 0, diff,
				   cmp_rem, dcond, src1_addr, src2_addr,
				   orig_src1, orig_src2);

	  rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label);
	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
	  JUMP_LABEL (j) = diff_label;
	  LABEL_NUSES (diff_label) += 1;
	  emit_barrier ();

	  /* If we couldn't do the overlap compare we have to be more
	     careful of the 4k boundary.  Test to see if either
	     address is less than word_mode_size away from a 4k
	     boundary.  If not, then we can do a load/shift/compare
	     and we are done.  We come to this code if length was less
	     than word_mode_size.  */

	  emit_label (final_cleanup);

	  /* We can still avoid the slow case if the length was larger
	     than one loop iteration, in which case go do the overlap
	     load compare path.  */
	  do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes),
		     NULL_RTX, nonconst_overlap);

	  rtx rem4k = gen_reg_rtx (word_mode);
	  rtx dist1 = gen_reg_rtx (word_mode);
	  rtx dist2 = gen_reg_rtx (word_mode);
	  do_sub3 (rem4k, GEN_INT (4096), cmp_rem);
	  if (word_mode == SImode)
	    emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff)));
	  else
	    emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff)));
	  do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX, handle4k_label);
	  if (word_mode == SImode)
	    emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff)));
	  else
	    emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff)));
	  do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX, handle4k_label);

	  /* We don't have a 4k boundary to deal with, so do
	     a load/shift/compare and jump to diff.  */

	  do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
				src1_addr, src2_addr, orig_src1, orig_src2);

	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
	  JUMP_LABEL (j) = diff_label;
	  LABEL_NUSES (diff_label) += 1;
	  emit_barrier ();

	  /* Finally in the unlikely case we are inching up to a
	     4k boundary we use a compact lbzx/compare loop to do
	     it a byte at a time.  */

	  emit_label (handle4k_label);

	  rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
	  emit_move_insn (ctr, cmp_rem);
	  rtx ixreg = gen_reg_rtx (Pmode);
	  emit_move_insn (ixreg, const0_rtx);

	  rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg);
	  rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg);
	  rtx d1 = gen_reg_rtx (word_mode);
	  rtx d2 = gen_reg_rtx (word_mode);

	  rtx fc_loop = gen_label_rtx ();
	  emit_label (fc_loop);

	  do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1);
	  do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2);

	  do_add3 (ixreg, ixreg, const1_rtx);

	  rtx cond = gen_reg_rtx (CCmode);
	  rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2);
	  rs6000_emit_dot_insn (diff, subexpr, 2, cond);

	  /* Loop while CTR is nonzero and the bytes were equal.  */
	  rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2);
	  if (TARGET_64BIT)
	    j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr,
					       eqrtx, cond));
	  else
	    j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr,
					       eqrtx, cond));
	  JUMP_LABEL (j) = fc_loop;
	  LABEL_NUSES (fc_loop) += 1;

	  if (TARGET_64BIT)
	    emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
	  else
	    emit_move_insn (target, diff);

	  /* Since we are comparing bytes, the difference can be used
	     as the final result and we are done here.  */
	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
	  JUMP_LABEL (j) = final_label;
	  LABEL_NUSES (final_label) += 1;
	  emit_barrier ();
	}
    }

  emit_label (diff_label);
  /* Difference handling, 64->32 conversion.  */

  /* We need to produce DI result from sub, then convert to target SI
     while maintaining <0 / ==0 / >0 properties.  This sequence works:
     subfc L,A,B
     subfe H,H,H
     popcntd L,L
     rldimi L,H,6,0

     This is an alternate one Segher cooked up if somebody
     wants to expand this for something that doesn't have popcntd:
     subfc L,a,b
     subfe H,x,x
     addic t,L,-1
     subfe v,t,L
     or z,v,H

     And finally, p9 can just do this:
     cmpld A,B
     setb r */

  if (TARGET_P9_MISC)
    emit_insn (gen_setb_unsigned (target, dcond));
  else
    {
      if (TARGET_64BIT)
	{
	  rtx tmp_reg_ca = gen_reg_rtx (DImode);
	  emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
	  emit_insn (gen_popcntddi2 (diff, diff));
	  emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca));
	  emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
	}
      else
	{
	  rtx tmp_reg_ca = gen_reg_rtx (SImode);
	  emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
	  emit_insn (gen_popcntdsi2 (diff, diff));
	  emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca));
	}
    }

  if (library_call_label != NULL)
    {
      /* Branch around memcmp call.  */
      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
      JUMP_LABEL (j) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();

      /* Make memcmp library call.  In the const-length case, cmp_rem
	 holds the bytes not yet compared and becomes the length passed
	 to memcmp; for runtime length we pass the full original length
	 since the loop was bypassed.  If we don't find a difference in
	 the loop compare, do the library call directly instead of doing
	 a small compare just to get to an arbitrary boundary before
	 calling it anyway.
	 Also, update addresses to point to the next word to examine.  */
      emit_label (library_call_label);

      rtx len_rtx = gen_reg_rtx (word_mode);
      if (bytes_is_const)
	{
	  emit_move_insn (len_rtx, cmp_rem);
	  do_add3 (src1_addr, src1_addr, iv1);
	  do_add3 (src2_addr, src2_addr, iv1);
	}
      else
	emit_move_insn (len_rtx, bytes_rtx);

      tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP);
      emit_library_call_value (XEXP (DECL_RTL (fun), 0),
			       target, LCT_NORMAL, GET_MODE (target),
			       src1_addr, Pmode,
			       src2_addr, Pmode,
			       len_rtx, GET_MODE (len_rtx));
    }

  /* Emit final_label: every successful path ends up here.  */
  emit_label (final_label);
  return true;
}
1614 | ||
37ae4739 AS |
1615 | /* Generate code to convert a DImode-plus-carry subtract result into |
1616 | a SImode result that has the same <0 / ==0 / >0 properties to | |
1617 | produce the final result from memcmp. | |
8845cb37 | 1618 | |
37ae4739 AS |
1619 | TARGET is the rtx for the register to receive the memcmp result. |
1620 | SUB_RESULT is the rtx for the register contining the subtract result. */ | |
8845cb37 | 1621 | |
37ae4739 AS |
1622 | void |
1623 | generate_6432_conversion(rtx target, rtx sub_result) | |
1624 | { | |
1625 | /* We need to produce DI result from sub, then convert to target SI | |
1626 | while maintaining <0 / ==0 / >0 properties. This sequence works: | |
1627 | subfc L,A,B | |
1628 | subfe H,H,H | |
1629 | popcntd L,L | |
1630 | rldimi L,H,6,0 | |
8845cb37 | 1631 | |
37ae4739 AS |
1632 | This is an alternate one Segher cooked up if somebody |
1633 | wants to expand this for something that doesn't have popcntd: | |
1634 | subfc L,a,b | |
1635 | subfe H,x,x | |
1636 | addic t,L,-1 | |
1637 | subfe v,t,L | |
1638 | or z,v,H | |
8845cb37 | 1639 | |
37ae4739 AS |
1640 | And finally, p9 can just do this: |
1641 | cmpld A,B | |
1642 | setb r */ | |
8845cb37 | 1643 | |
37ae4739 AS |
1644 | if (TARGET_64BIT) |
1645 | { | |
1646 | rtx tmp_reg_ca = gen_reg_rtx (DImode); | |
1647 | emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca)); | |
1648 | rtx popcnt = gen_reg_rtx (DImode); | |
1649 | emit_insn (gen_popcntddi2 (popcnt, sub_result)); | |
1650 | rtx tmp2 = gen_reg_rtx (DImode); | |
1651 | emit_insn (gen_iordi3 (tmp2, popcnt, tmp_reg_ca)); | |
1652 | emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp2))); | |
1653 | } | |
8845cb37 | 1654 | else |
37ae4739 AS |
1655 | { |
1656 | rtx tmp_reg_ca = gen_reg_rtx (SImode); | |
1657 | emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca)); | |
1658 | rtx popcnt = gen_reg_rtx (SImode); | |
1659 | emit_insn (gen_popcntdsi2 (popcnt, sub_result)); | |
1660 | emit_insn (gen_iorsi3 (target, popcnt, tmp_reg_ca)); | |
1661 | } | |
1662 | } | |
8845cb37 | 1663 | |
37ae4739 AS |
/* Generate memcmp expansion using in-line non-loop GPR instructions.
   The bool return indicates whether code for a 64->32 conversion
   should be generated.

   BYTES is the number of bytes to be compared.
   BASE_ALIGN is the minimum alignment for both blocks to compare.
   ORIG_SRC1 is the original pointer to the first block to compare.
   ORIG_SRC2 is the original pointer to the second block to compare.
   SUB_RESULT is the reg rtx for the result from the final subtract.
   COND is rtx for a condition register that will be used for the final
   compare on power9 or better.
   FINAL_RESULT is the reg rtx for the final memcmp result.
   P_CONVERT_LABEL is a pointer to rtx that will be used to store the
   label generated for a branch to the 64->32 code, if such a branch
   is needed.
   P_FINAL_LABEL is a pointer to rtx that will be used to store the label
   for the end of the memcmp if a branch there is needed.  */

bool
expand_block_compare_gpr(unsigned HOST_WIDE_INT bytes, unsigned int base_align,
			 rtx orig_src1, rtx orig_src2,
			 rtx sub_result, rtx cond, rtx final_result,
			 rtx *p_convert_label, rtx *p_final_label)
{
  /* Example of generated code for 18 bytes aligned 1 byte.
     Compiled with -fno-reorder-blocks for clarity.
	     ldbrx 10,31,8
	     ldbrx 9,7,8
	     subfc. 9,9,10
	     bne 0,.L6487
	     addi 9,12,8
	     addi 5,11,8
	     ldbrx 10,0,9
	     ldbrx 9,0,5
	     subfc. 9,9,10
	     bne 0,.L6487
	     addi 9,12,16
	     lhbrx 10,0,9
	     addi 9,11,16
	     lhbrx 9,0,9
	     subf 9,9,10
	     b .L6488
	     .p2align 4,,15
	     .L6487: #convert_label
	     popcntd 9,9
	     subfe 10,10,10
	     or 9,9,10
	     .L6488: #final_label
	     extsw 10,9

     We start off with DImode for two blocks that jump to the DI->SI conversion
     if the difference is found there, then a final block of HImode that skips
     the DI->SI conversion.  */

  unsigned HOST_WIDE_INT offset = 0;
  unsigned int load_mode_size;
  HOST_WIDE_INT cmp_bytes = 0;
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;
  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
  bool need_6432_conv = false;
  rtx convert_label = NULL;
  rtx final_label = NULL;
  machine_mode load_mode;

  /* Emit one compare per chunk; chunk size is chosen from the known
     alignment at the current offset.  */
  while (bytes > 0)
    {
      unsigned int align = compute_current_alignment (base_align, offset);
      load_mode = select_block_compare_mode (offset, bytes, align);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes >= load_mode_size)
	cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	{
	  /* Move this load back so it doesn't go past the end.
	     P8/P9 can do this efficiently.  */
	  unsigned int extra_bytes = load_mode_size - bytes;
	  cmp_bytes = bytes;
	  if (extra_bytes < offset)
	    {
	      offset -= extra_bytes;
	      cmp_bytes = load_mode_size;
	      bytes = cmp_bytes;
	    }
	}
      else
	/* P7 and earlier can't do the overlapping load trick fast,
	   so this forces a non-overlapping load and a shift to get
	   rid of the extra bytes.  */
	cmp_bytes = bytes;

      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      /* Force both addresses into registers so the loads are valid.  */
      if (!REG_P (XEXP (src1, 0)))
	{
	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
	  src1 = replace_equiv_address (src1, src1_reg);
	}
      set_mem_size (src1, load_mode_size);

      if (!REG_P (XEXP (src2, 0)))
	{
	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
	  src2 = replace_equiv_address (src2, src2_reg);
	}
      set_mem_size (src2, load_mode_size);

      do_load_for_compare (tmp_reg_src1, src1, load_mode);
      do_load_for_compare (tmp_reg_src2, src2, load_mode);

      if (cmp_bytes < load_mode_size)
	{
	  /* Shift unneeded bytes off.  */
	  rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
	  if (word_mode == DImode)
	    {
	      emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	  else
	    {
	      emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	}

      int remain = bytes - cmp_bytes;
      if (GET_MODE_SIZE (GET_MODE (final_result)) > GET_MODE_SIZE (load_mode))
	{
	  /* Final_result is larger than load size so we don't need to
	     reduce result size.  */

	  /* We previously did a block that needed 64->32 conversion but
	     the current block does not, so a label is needed to jump
	     to the end.  */
	  if (need_6432_conv && !final_label)
	    final_label = gen_label_rtx ();

	  if (remain > 0)
	    {
	      /* This is not the last block, branch to the end if the result
		 of this subtract is not zero.  */
	      if (!final_label)
		final_label = gen_label_rtx ();
	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
	      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
	      rtx cr = gen_reg_rtx (CCmode);
	      /* Record-form subtract so the CR reflects the difference.  */
	      rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
	      emit_insn (gen_movsi (final_result,
				    gen_lowpart (SImode, tmp_reg_src2)));
	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
						 fin_ref, pc_rtx);
	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      JUMP_LABEL (j) = final_label;
	      LABEL_NUSES (final_label) += 1;
	    }
	  else
	    {
	      /* Last block: a plain subtract yields the result directly.  */
	      if (word_mode == DImode)
		{
		  emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
					 tmp_reg_src2));
		  emit_insn (gen_movsi (final_result,
					gen_lowpart (SImode, tmp_reg_src2)));
		}
	      else
		emit_insn (gen_subsi3 (final_result, tmp_reg_src1, tmp_reg_src2));

	      if (final_label)
		{
		  rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
		  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
		  JUMP_LABEL (j) = final_label;
		  LABEL_NUSES (final_label) += 1;
		  emit_barrier ();
		}
	    }
	}
      else
	{
	  /* Do we need a 64->32 conversion block?  We need the 64->32
	     conversion even if final_result size == load_mode size because
	     the subtract generates one extra bit.  */
	  need_6432_conv = true;

	  if (remain > 0)
	    {
	      if (!convert_label)
		convert_label = gen_label_rtx ();

	      /* Compare to zero and branch to convert_label if not zero.  */
	      rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
	      if (TARGET_P9_MISC)
		{
		  /* Generate a compare, and convert with a setb later.
		     Use cond that is passed in because the caller needs
		     to use it for the 64->32 conversion later.  */
		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
					     tmp_reg_src2);
		  emit_insn (gen_rtx_SET (cond, cmp));
		}
	      else
		{
		  /* Generate a subfc. and use the longer sequence for
		     conversion.  Cond is not used outside this
		     function in this case.  */
		  cond = gen_reg_rtx (CCmode);
		  if (TARGET_64BIT)
		    emit_insn (gen_subfdi3_carry_dot2 (sub_result, tmp_reg_src2,
						       tmp_reg_src1, cond));
		  else
		    emit_insn (gen_subfsi3_carry_dot2 (sub_result, tmp_reg_src2,
						       tmp_reg_src1, cond));
		}

	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
						 cvt_ref, pc_rtx);
	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      JUMP_LABEL (j) = convert_label;
	      LABEL_NUSES (convert_label) += 1;
	    }
	  else
	    {
	      /* Just do the subtract/compare.  Since this is the last block
		 the convert code will be generated immediately following.  */
	      if (TARGET_P9_MISC)
		{
		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
					     tmp_reg_src2);
		  emit_insn (gen_rtx_SET (cond, cmp));
		}
	      else
		if (TARGET_64BIT)
		  emit_insn (gen_subfdi3_carry (sub_result, tmp_reg_src2,
						tmp_reg_src1));
		else
		  emit_insn (gen_subfsi3_carry (sub_result, tmp_reg_src2,
						tmp_reg_src1));
	    }
	}

      offset += cmp_bytes;
      bytes -= cmp_bytes;
    }

  /* Only pass labels back if they were created; callers initialize the
     pointed-to rtx themselves.  */
  if (convert_label)
    *p_convert_label = convert_label;
  if (final_label)
    *p_final_label = final_label;
  return need_6432_conv;
}
1920 | ||
/* Expand a block compare operation, and return true if successful.
   Return false if we should let the compiler generate normal code,
   probably a memcmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_block_compare (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);

  /* Allow this param to shut off all expansion.  */
  if (rs6000_block_compare_inline_limit == 0)
    return false;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.
     However slow_unaligned_access returns true on P7 even though the
     performance of this code is good there.  */
  if (!isP7
      && (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
	  || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2))))
    return false;

  /* Unaligned l*brx traps on P7 so don't do this.  However this should
     not affect much because LE isn't really supported on P7 anyway.  */
  if (isP7 && !BYTES_BIG_ENDIAN)
    return false;

  /* If this is not a fixed size compare, try generating loop code and
     if that fails just call memcmp.  */
  if (!CONST_INT_P (bytes_rtx))
    return expand_compare_loop (operands);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to move?  */
  unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
  if (bytes == 0)
    return true;

  /* P7/P8 code uses cond for subfc. but P9 uses
     it for cmpld which needs CCUNSmode.  */
  rtx cond = NULL;
  if (TARGET_P9_MISC)
    cond = gen_reg_rtx (CCUNSmode);

  /* Is it OK to use vec/vsx for this.  TARGET_VSX means we have at
     least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
     at least POWER8.  That way we can rely on overlapping compares to
     do the final comparison of less than 16 bytes.  Also I do not
     want to deal with making this work for 32 bits.  In addition, we
     have to make sure that we have at least P8_VECTOR (we don't allow
     P9_VECTOR without P8_VECTOR).  */
  int use_vec = (bytes >= 33 && !TARGET_32BIT
		 && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR);

  /* We don't want to generate too much code.  The loop code can take
     over for lengths greater than 31 bytes.  */
  unsigned HOST_WIDE_INT max_bytes = rs6000_block_compare_inline_limit;

  /* Don't generate too much code if vsx was disabled.  */
  if (!use_vec && max_bytes > 1)
    max_bytes = ((max_bytes + 1) / 2) - 1;

  if (!IN_RANGE (bytes, 1, max_bytes))
    return expand_compare_loop (operands);

  /* The code generated for p7 and older is not faster than glibc
     memcmp if alignment is small and length is not short, so bail
     out to avoid those conditions.  */
  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
      && ((base_align == 1 && bytes > 16)
	  || (base_align == 2 && bytes > 32)))
    return false;

  rtx final_label = NULL;

  if (use_vec)
    {
      /* Vector/VSX path: compare 16 bytes at a time, then invert the
	 cmpb-style result and compute the final scalar answer.  */
      rtx final_move_label = gen_label_rtx ();
      rtx s1addr = gen_reg_rtx (Pmode);
      rtx s2addr = gen_reg_rtx (Pmode);
      rtx off_reg = gen_reg_rtx (Pmode);
      rtx cleanup_label = NULL;
      rtx vec_result = gen_reg_rtx (V16QImode);
      rtx s1data = gen_reg_rtx (V16QImode);
      rtx s2data = gen_reg_rtx (V16QImode);
      rtx result_reg = gen_reg_rtx (word_mode);
      /* Result defaults to zero (equal) if no difference is found.  */
      emit_move_insn (result_reg, GEN_INT (0));

      expand_cmp_vec_sequence (bytes, orig_src1, orig_src2,
			       s1addr, s2addr, off_reg, s1data, s2data,
			       vec_result, false,
			       &cleanup_label, final_move_label, false);

      if (cleanup_label)
	emit_label (cleanup_label);

      emit_insn (gen_one_cmplv16qi2 (vec_result, vec_result));

      emit_final_compare_vec (s1data, s2data, result_reg,
			      s1addr, s2addr, orig_src1, orig_src2,
			      off_reg, vec_result);

      emit_label (final_move_label);
      emit_insn (gen_movsi (target,
			    gen_lowpart (SImode, result_reg)));
    }
  else
    { /* generate GPR code */

      rtx convert_label = NULL;
      rtx sub_result = gen_reg_rtx (word_mode);
      bool need_6432_conversion =
	expand_block_compare_gpr(bytes, base_align,
				 orig_src1, orig_src2,
				 sub_result, cond, target,
				 &convert_label, &final_label);

      if (need_6432_conversion)
	{
	  if (convert_label)
	    emit_label (convert_label);
	  /* P9 converts with a single setb; older CPUs need the longer
	     popcntd/subfe/or sequence.  */
	  if (TARGET_P9_MISC)
	    emit_insn (gen_setb_unsigned (target, cond));
	  else
	    generate_6432_conversion(target, sub_result);
	}
    }

  if (final_label)
    emit_label (final_label);

  return true;
}
2078 | ||
f7e94dfb | 2079 | /* Generate page crossing check and branch code to set up for |
8845cb37 AS |
2080 | strncmp when we don't have DI alignment. |
2081 | STRNCMP_LABEL is the label to branch if there is a page crossing. | |
f7e94dfb | 2082 | SRC_ADDR is the string address to be examined. |
8845cb37 AS |
2083 | BYTES is the max number of bytes to compare. */ |
2084 | static void | |
f7e94dfb | 2085 | expand_strncmp_align_check (rtx strncmp_label, rtx src_addr, HOST_WIDE_INT bytes) |
8845cb37 AS |
2086 | { |
2087 | rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label); | |
f7e94dfb AS |
2088 | rtx src_pgoff = gen_reg_rtx (GET_MODE (src_addr)); |
2089 | do_and3 (src_pgoff, src_addr, GEN_INT (0xfff)); | |
8845cb37 | 2090 | rtx cond = gen_reg_rtx (CCmode); |
f7e94dfb | 2091 | emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_pgoff, |
8845cb37 AS |
2092 | GEN_INT (4096 - bytes))); |
2093 | ||
0c791c59 | 2094 | rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx); |
8845cb37 AS |
2095 | |
2096 | rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, | |
0c791c59 | 2097 | lab_ref, pc_rtx); |
8845cb37 AS |
2098 | rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); |
2099 | JUMP_LABEL (j) = strncmp_label; | |
2100 | LABEL_NUSES (strncmp_label) += 1; | |
2101 | } | |
2102 | ||
74f9986e AS |
/* Generate the sequence of compares for strcmp/strncmp using gpr instructions.
   BYTES_TO_COMPARE is the number of bytes to be compared.
   BASE_ALIGN is the smaller of the alignment of the two strings.
   ORIG_SRC1 is the unmodified rtx for the first string.
   ORIG_SRC2 is the unmodified rtx for the second string.
   TMP_REG_SRC1 is the register for loading the first string.
   TMP_REG_SRC2 is the register for loading the second string.
   RESULT_REG is the rtx for the result register.
   EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
   to strcmp/strncmp if we have equality at the end of the inline comparison.
   P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
   to clean up and generate the final comparison result.
   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
   set the final result.  */
static void
expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
			     unsigned int base_align,
			     rtx orig_src1, rtx orig_src2,
			     rtx tmp_reg_src1, rtx tmp_reg_src2, rtx result_reg,
			     bool equality_compare_rest, rtx *p_cleanup_label,
			     rtx final_move_label)
{
  unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
  machine_mode load_mode;
  unsigned int load_mode_size;
  unsigned HOST_WIDE_INT cmp_bytes = 0;
  unsigned HOST_WIDE_INT offset = 0;
  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
  gcc_assert (p_cleanup_label != NULL);
  rtx cleanup_label = *p_cleanup_label;

  while (bytes_to_compare > 0)
    {
      /* GPR compare sequence:
	 check each 8B with: ld/ld/cmpb/cmpb/orc./bne

	 cleanup code at end:
	 cntlzd        get bit of first zero/diff byte
	 subfic        convert for rldcl use
	 rldcl rldcl   extract diff/zero byte
	 subf          subtract for final result

	 The last compare can branch around the cleanup code if the
	 result is zero because the strings are exactly equal.  */

      unsigned int align = compute_current_alignment (base_align, offset);
      load_mode = select_block_compare_mode (offset, bytes_to_compare, align);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes_to_compare >= load_mode_size)
	cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	{
	  /* Move this load back so it doesn't go past the end.
	     P8/P9 can do this efficiently.  */
	  unsigned int extra_bytes = load_mode_size - bytes_to_compare;
	  cmp_bytes = bytes_to_compare;
	  if (extra_bytes < offset)
	    {
	      offset -= extra_bytes;
	      cmp_bytes = load_mode_size;
	      bytes_to_compare = cmp_bytes;
	    }
	}
      else
	/* P7 and earlier can't do the overlapping load trick fast,
	   so this forces a non-overlapping load and a shift to get
	   rid of the extra bytes.  */
	cmp_bytes = bytes_to_compare;

      /* On LE without TARGET_AVOID_XFORM the offset goes in a register
	 so the indexed-form (X-form) load can be used.  */
      rtx offset_rtx;
      if (BYTES_BIG_ENDIAN || TARGET_AVOID_XFORM)
	offset_rtx = GEN_INT (offset);
      else
	{
	  offset_rtx = gen_reg_rtx (Pmode);
	  emit_move_insn (offset_rtx, GEN_INT (offset));
	}
      rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_rtx);
      rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_rtx);

      do_load_for_compare_from_addr (load_mode, tmp_reg_src1, addr1, orig_src1);
      do_load_for_compare_from_addr (load_mode, tmp_reg_src2, addr2, orig_src2);

      /* We must always left-align the data we read, and
	 clear any bytes to the right that are beyond the string.
	 Otherwise the cmpb sequence won't produce the correct
	 results.  However if there is only one byte left, we
	 can just subtract to get the final result so the shifts
	 and clears are not needed.  */

      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;

      /* Loading just a single byte is a special case.  If we are
	 loading more than that, we have to check whether we are
	 looking at the entire chunk of data.  If not, rotate left and
	 clear right so that bytes we aren't supposed to look at are
	 zeroed, and the first byte we are supposed to compare is
	 leftmost.  */
      if (load_mode_size != 1)
	{
	  if (load_mode_size < word_mode_size)
	    {
	      /* Rotate left first.  */
	      rtx sh = GEN_INT (BITS_PER_UNIT
				* (word_mode_size - load_mode_size));
	      do_rotl3 (tmp_reg_src1, tmp_reg_src1, sh);
	      do_rotl3 (tmp_reg_src2, tmp_reg_src2, sh);
	    }

	  if (cmp_bytes < word_mode_size)
	    {
	      /* Now clear right.  This plus the rotate can be
		 turned into a rldicr instruction.  */
	      HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
	      rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
	      do_and3 (tmp_reg_src1, tmp_reg_src1, mask);
	      do_and3 (tmp_reg_src2, tmp_reg_src2, mask);
	    }
	}

      /* Cases to handle.  A and B are chunks of the two strings.
	 1: Not end of comparison:
	 A != B: branch to cleanup code to compute result.
	 A == B: check for 0 byte, next block if not found.
	 2: End of the inline comparison:
	 A != B: branch to cleanup code to compute result.
	 A == B: check for 0 byte, call strcmp/strncmp
	 3: compared requested N bytes:
	 A == B: branch to result 0.
	 A != B: cleanup code to compute result.  */

      rtx dst_label;
      if (remain > 0 || equality_compare_rest)
	{
	  /* Branch to cleanup code, otherwise fall through to do
	     more compares.  */
	  if (!cleanup_label)
	    cleanup_label = gen_label_rtx ();
	  dst_label = cleanup_label;
	}
      else
	/* Branch to end and produce result of 0.  */
	dst_label = final_move_label;

      if (load_mode_size == 1)
	{
	  /* Special case for comparing just single byte.  */
	  if (equality_compare_rest)
	    {
	      /* Use subf./bne to branch to final_move_label if the
		 byte differs, otherwise fall through to the strncmp
		 call.  We must also check for a zero byte here as we
		 must not make the library call if this is the end of
		 the string.  */

	      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
	      rtx cond = gen_reg_rtx (CCmode);
	      rtx diff_rtx = gen_rtx_MINUS (word_mode,
					    tmp_reg_src1, tmp_reg_src2);
	      rs6000_emit_dot_insn (result_reg, diff_rtx, 2, cond);
	      rtx cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
						 lab_ref, pc_rtx);
	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      JUMP_LABEL (j) = final_move_label;
	      LABEL_NUSES (final_move_label) += 1;

	      /* Check for zero byte here before fall through to
		 library call.  This catches the case where the
		 strings are equal and end in a zero byte at this
		 position.  */

	      rtx cond0 = gen_reg_rtx (CCmode);
	      emit_move_insn (cond0, gen_rtx_COMPARE (CCmode, tmp_reg_src1,
						      const0_rtx));

	      rtx cmp0eq_rtx = gen_rtx_EQ (VOIDmode, cond0, const0_rtx);

	      rtx ifelse0 = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp0eq_rtx,
						  lab_ref, pc_rtx);
	      rtx j0 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse0));
	      JUMP_LABEL (j0) = final_move_label;
	      LABEL_NUSES (final_move_label) += 1;
	    }
	  else
	    {
	      /* This is the last byte to be compared so we can use
		 subf to compute the final result and branch
		 unconditionally to final_move_label.  */

	      do_sub3 (result_reg, tmp_reg_src1, tmp_reg_src2);

	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
	      JUMP_LABEL (j) = final_move_label;
	      LABEL_NUSES (final_move_label) += 1;
	      emit_barrier ();
	    }
	}
      else
	{
	  /* Multi-byte chunk: cmpb against the other string finds
	     differing bytes; cmpb against zero finds the terminator.
	     orc. of the two sets CR nonzero if either was seen.  */
	  rtx cmpb_zero = gen_reg_rtx (word_mode);
	  rtx cmpb_diff = gen_reg_rtx (word_mode);
	  rtx zero_reg = gen_reg_rtx (word_mode);
	  rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
	  rtx cond = gen_reg_rtx (CCmode);

	  emit_move_insn (zero_reg, GEN_INT (0));
	  do_cmpb3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2);
	  do_cmpb3 (cmpb_zero, tmp_reg_src1, zero_reg);
	  rtx not_diff = gen_rtx_NOT (word_mode, cmpb_diff);
	  rtx orc_rtx = gen_rtx_IOR (word_mode, not_diff, cmpb_zero);

	  rs6000_emit_dot_insn (result_reg, orc_rtx, 2, cond);

	  /* At the very end with nothing further to compare, an equal
	     chunk means result 0, so branch on EQ; otherwise branch to
	     the cleanup code on NE.  */
	  rtx cmp_rtx;
	  if (remain == 0 && !equality_compare_rest)
	    cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
	  else
	    cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

	  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
					     lab_ref, pc_rtx);
	  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	  JUMP_LABEL (j) = dst_label;
	  LABEL_NUSES (dst_label) += 1;
	}

      offset += cmp_bytes;
      bytes_to_compare -= cmp_bytes;
    }

  *p_cleanup_label = cleanup_label;
  return;
}
2340 | ||
f7e94dfb AS |
2341 | /* Generate the final sequence that identifies the differing |
2342 | byte and generates the final result, taking into account | |
2343 | zero bytes: | |
ef4adf1f | 2344 | |
f7e94dfb AS |
2345 | cntlzd get bit of first zero/diff byte |
2346 | addi convert for rldcl use | |
2347 | rldcl rldcl extract diff/zero byte | |
2348 | subf subtract for final result | |
2349 | ||
2350 | STR1 is the reg rtx for data from string 1. | |
2351 | STR2 is the reg rtx for data from string 2. | |
2352 | RESULT is the reg rtx for the comparison result. */ | |
2353 | ||
2354 | static void | |
2355 | emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result) | |
2356 | { | |
2357 | machine_mode m = GET_MODE (str1); | |
f7e94dfb | 2358 | rtx rot_amt = gen_reg_rtx (m); |
f7e94dfb AS |
2359 | |
2360 | rtx rot1_1 = gen_reg_rtx (m); | |
2361 | rtx rot1_2 = gen_reg_rtx (m); | |
2362 | rtx rot2_1 = gen_reg_rtx (m); | |
2363 | rtx rot2_2 = gen_reg_rtx (m); | |
2364 | ||
2365 | if (m == SImode) | |
2366 | { | |
ef4adf1f | 2367 | emit_insn (gen_clzsi2 (rot_amt, result)); |
f7e94dfb AS |
2368 | emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8))); |
2369 | emit_insn (gen_rotlsi3 (rot1_1, str1, | |
2370 | gen_lowpart (SImode, rot_amt))); | |
2371 | emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff))); | |
2372 | emit_insn (gen_rotlsi3 (rot2_1, str2, | |
2373 | gen_lowpart (SImode, rot_amt))); | |
2374 | emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff))); | |
2375 | emit_insn (gen_subsi3 (result, rot1_2, rot2_2)); | |
2376 | } | |
2377 | else if (m == DImode) | |
2378 | { | |
ef4adf1f | 2379 | emit_insn (gen_clzdi2 (rot_amt, result)); |
f7e94dfb AS |
2380 | emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8))); |
2381 | emit_insn (gen_rotldi3 (rot1_1, str1, | |
2382 | gen_lowpart (SImode, rot_amt))); | |
2383 | emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff))); | |
2384 | emit_insn (gen_rotldi3 (rot2_1, str2, | |
2385 | gen_lowpart (SImode, rot_amt))); | |
2386 | emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff))); | |
2387 | emit_insn (gen_subdi3 (result, rot1_2, rot2_2)); | |
2388 | } | |
2389 | else | |
2390 | gcc_unreachable (); | |
ef4adf1f | 2391 | |
f7e94dfb AS |
2392 | return; |
2393 | } | |
2394 | ||
/* Expand a string compare operation with length, and return
   true if successful.  Return false if we should let the
   compiler generate normal code, probably a strncmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   If NO_LENGTH is zero, then:
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment in bytes.
   If NO_LENGTH is nonzero, then:
   OPERANDS[3] is the alignment in bytes.  */
bool
expand_strn_compare (rtx operands[], int no_length)
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx, align_rtx;
  /* With no length (strcmp), operand 3 is the alignment; with a
     length (strncmp), operand 3 is the length and 4 the alignment.  */
  if (no_length)
    {
      bytes_rtx = NULL;
      align_rtx = operands[3];
    }
  else
    {
      bytes_rtx = operands[3];
      align_rtx = operands[4];
    }

  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));

  /* If we have a length, it must be constant.  This simplifies things
     a bit as we don't have to generate code to check if we've exceeded
     the length.  Later this could be expanded to handle this case.  */
  if (!no_length && !CONST_INT_P (bytes_rtx))
    return false;

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx);
  unsigned int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  unsigned int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.  */
  if (targetm.slow_unaligned_access (word_mode, align1)
      || targetm.slow_unaligned_access (word_mode, align2))
    return false;

  gcc_assert (GET_MODE (target) == SImode);

  /* Minimum known alignment needed to avoid the runtime 4k
     page-crossing check below; raised to 16 for the vector path.  */
  unsigned int required_align = 8;

  unsigned HOST_WIDE_INT offset = 0;
  unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available.  */
  unsigned HOST_WIDE_INT compare_length; /* How much to compare inline.  */

  if (no_length)
    bytes = rs6000_string_compare_inline_limit;
  else
    bytes = UINTVAL (bytes_rtx);

  /* Is it OK to use vec/vsx for this.  TARGET_VSX means we have at
     least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
     at least POWER8.  That way we can rely on overlapping compares to
     do the final comparison of less than 16 bytes.  Also I do not
     want to deal with making this work for 32 bits.  In addition, we
     have to make sure that we have at least P8_VECTOR (we don't allow
     P9_VECTOR without P8_VECTOR).  */
  int use_vec = (bytes >= 16 && !TARGET_32BIT
		 && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR);

  if (use_vec)
    required_align = 16;

  /* Pick the load mode and temporaries: V16QI vector registers for
     the vector path, word-sized GPRs otherwise.  */
  machine_mode load_mode;
  rtx tmp_reg_src1, tmp_reg_src2;
  if (use_vec)
    {
      load_mode = V16QImode;
      tmp_reg_src1 = gen_reg_rtx (V16QImode);
      tmp_reg_src2 = gen_reg_rtx (V16QImode);
    }
  else
    {
      load_mode = select_block_compare_mode (0, bytes, base_align);
      tmp_reg_src1 = gen_reg_rtx (word_mode);
      tmp_reg_src2 = gen_reg_rtx (word_mode);
    }

  compare_length = rs6000_string_compare_inline_limit;

  /* If we have equality at the end of the last compare and we have not
     found the end of the string, we need to call strcmp/strncmp to
     compare the remainder.  */
  bool equality_compare_rest = false;

  if (no_length)
    {
      bytes = compare_length;
      equality_compare_rest = true;
    }
  else
    {
      if (bytes <= compare_length)
	compare_length = bytes;
      else
	equality_compare_rest = true;
    }

  /* result_reg accumulates the byte difference; final_move_label is
     where all inline-compare exits converge to copy it into TARGET;
     final_label skips that copy when a library call already set
     TARGET.  */
  rtx result_reg = gen_reg_rtx (word_mode);
  rtx final_move_label = gen_label_rtx ();
  rtx final_label = gen_label_rtx ();
  rtx begin_compare_label = NULL;

  if (base_align < required_align)
    {
      /* Generate code that checks distance to 4k boundary for this case.  */
      begin_compare_label = gen_label_rtx ();
      rtx strncmp_label = gen_label_rtx ();
      rtx jmp;

      /* Strncmp for power8 in glibc does this:
	 rldicl r8,r3,0,52
	 cmpldi cr7,r8,4096-16
	 bgt cr7,L(pagecross) */

      /* Make sure that the length we use for the alignment test and
	 the subsequent code generation are in agreement so we do not
	 go past the length we tested for a 4k boundary crossing.  */
      unsigned HOST_WIDE_INT align_test = compare_length;
      if (align_test < required_align)
	{
	  align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
	  base_align = align_test;
	}
      else
	{
	  align_test = ROUND_UP (align_test, required_align);
	  base_align = required_align;
	}

      /* Branch to strncmp_label (the out-of-line library call) if
	 either source is too close to the end of a 4k page.  */
      if (align1 < required_align)
	expand_strncmp_align_check (strncmp_label, src1_addr, align_test);
      if (align2 < required_align)
	expand_strncmp_align_check (strncmp_label, src2_addr, align_test);

      /* Now generate the following sequence:
	 - branch to begin_compare
	 - strncmp_label
	 - call to strncmp
	 - branch to final_label
	 - begin_compare_label */

      rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
      JUMP_LABEL (jmp) = begin_compare_label;
      LABEL_NUSES (begin_compare_label) += 1;
      emit_barrier ();

      emit_label (strncmp_label);

      if (no_length)
	{
	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, src1_addr), Pmode,
				   force_reg (Pmode, src2_addr), Pmode);
	}
      else
	{
	  /* -m32 -mpowerpc64 results in word_mode being DImode even
	     though otherwise it is 32-bit.  The length arg to strncmp
	     is a size_t which will be the same size as pointers.  */
	  rtx len_rtx = gen_reg_rtx (Pmode);
	  emit_move_insn (len_rtx, gen_int_mode (bytes, Pmode));

	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, src1_addr), Pmode,
				   force_reg (Pmode, src2_addr), Pmode,
				   len_rtx, Pmode);
	}

      /* The library call set TARGET directly; jump past the final
	 result-register move.  */
      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
      emit_label (begin_compare_label);
    }

  rtx cleanup_label = NULL;
  rtx s1addr = NULL, s2addr = NULL, off_reg = NULL, vec_result = NULL;

  /* Generate a sequence of GPR or VEC/VSX instructions to compare out
     to the length specified.  */
  if (use_vec)
    {
      s1addr = gen_reg_rtx (Pmode);
      s2addr = gen_reg_rtx (Pmode);
      off_reg = gen_reg_rtx (Pmode);
      vec_result = gen_reg_rtx (load_mode);
      emit_move_insn (result_reg, GEN_INT (0));
      expand_cmp_vec_sequence (compare_length,
			       orig_src1, orig_src2,
			       s1addr, s2addr, off_reg,
			       tmp_reg_src1, tmp_reg_src2,
			       vec_result,
			       equality_compare_rest,
			       &cleanup_label, final_move_label, true);
    }
  else
    expand_strncmp_gpr_sequence (compare_length, base_align,
				 orig_src1, orig_src2,
				 tmp_reg_src1, tmp_reg_src2,
				 result_reg,
				 equality_compare_rest,
				 &cleanup_label, final_move_label);

  offset = compare_length;

  if (equality_compare_rest)
    {
      /* Update pointers past what has been compared already.  */
      rtx src1 = force_reg (Pmode,
			    gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset)));
      rtx src2 = force_reg (Pmode,
			    gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset)));

      /* Construct call to strcmp/strncmp to compare the rest of the string.  */
      if (no_length)
	{
	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   src1, Pmode, src2, Pmode);
	}
      else
	{
	  rtx len_rtx = gen_reg_rtx (Pmode);
	  emit_move_insn (len_rtx, gen_int_mode (bytes - compare_length, Pmode));
	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   src1, Pmode, src2, Pmode, len_rtx, Pmode);
	}

      /* TARGET was set by the library call; skip the result move.  */
      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }

  /* cleanup_label is set by the sequence expanders when a mismatch
     was found and the differing byte still has to be isolated.  */
  if (cleanup_label)
    emit_label (cleanup_label);

  if (use_vec)
    emit_final_compare_vec (tmp_reg_src1, tmp_reg_src2, result_reg,
			    s1addr, s2addr, orig_src1, orig_src2,
			    off_reg, vec_result);
  else
    emit_final_str_compare_gpr (tmp_reg_src1, tmp_reg_src2, result_reg);

  emit_label (final_move_label);
  emit_insn (gen_movsi (target,
			gen_lowpart (SImode, result_reg)));
  emit_label (final_label);
  return true;
}
2671 | ||
19db0ebb AS |
2672 | /* Generate loads and stores for a move of v4si mode using lvx/stvx. |
2673 | This uses altivec_{l,st}vx_<mode>_internal which use unspecs to | |
2674 | keep combine from changing what instruction gets used. | |
2675 | ||
2676 | DEST is the destination for the data. | |
2677 | SRC is the source of the data for the move. */ | |
2678 | ||
2679 | static rtx | |
2680 | gen_lvx_v4si_move (rtx dest, rtx src) | |
2681 | { | |
2682 | gcc_assert (MEM_P (dest) ^ MEM_P (src)); | |
2683 | gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode); | |
2684 | ||
2685 | if (MEM_P (dest)) | |
2686 | return gen_altivec_stvx_v4si_internal (dest, src); | |
2687 | else | |
2688 | return gen_altivec_lvx_v4si_internal (dest, src); | |
2689 | } | |
2690 | ||
/* Expand a block move operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the source
   operands[2] is the length
   operands[3] is the alignment */

/* Maximum number of load/store pairs batched before the stores are
   flushed (loads are emitted immediately, stores are queued).  */
#define MAX_MOVE_REG 4

int
expand_block_move (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx orig_src	= operands[1];
  rtx bytes_rtx	= operands[2];
  rtx align_rtx = operands[3];
  int constp	= CONST_INT_P (bytes_rtx);
  int align;
  int bytes;
  int offset;
  int move_bytes;
  rtx stores[MAX_MOVE_REG];
  int num_reg = 0;

  /* If this is not a fixed size move, just call memcpy */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment */
  gcc_assert (CONST_INT_P (align_rtx));
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to move? */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  if (bytes > rs6000_block_move_inline_limit)
    return 0;

  /* Each iteration picks the widest piece the remaining size and
     alignment permit, loads it into a fresh pseudo, and queues the
     matching store.  */
  for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
    {
      union {
	rtx (*movmemsi) (rtx, rtx, rtx, rtx);
	rtx (*mov) (rtx, rtx);
      } gen_func;
      machine_mode mode = BLKmode;
      rtx src, dest;

      /* Altivec first, since it will be faster than a string move
	 when it applies, and usually not significantly larger.  */
      if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
	{
	  move_bytes = 16;
	  mode = V4SImode;
	  gen_func.mov = gen_lvx_v4si_move;
	}
      else if (bytes >= 8 && TARGET_POWERPC64
	       && (align >= 64 || !STRICT_ALIGNMENT))
	{
	  move_bytes = 8;
	  mode = DImode;
	  gen_func.mov = gen_movdi;
	  if (offset == 0 && align < 64)
	    {
	      rtx addr;

	      /* If the address form is reg+offset with offset not a
		 multiple of four, reload into reg indirect form here
		 rather than waiting for reload.  This way we get one
		 reload, not one per load and/or store.  */
	      addr = XEXP (orig_dest, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && CONST_INT_P (XEXP (addr, 1))
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_dest = replace_equiv_address (orig_dest, addr);
		}
	      addr = XEXP (orig_src, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && CONST_INT_P (XEXP (addr, 1))
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_src = replace_equiv_address (orig_src, addr);
		}
	    }
	}
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
	{			/* move 4 bytes */
	  move_bytes = 4;
	  mode = SImode;
	  gen_func.mov = gen_movsi;
	}
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
	{			/* move 2 bytes */
	  move_bytes = 2;
	  mode = HImode;
	  gen_func.mov = gen_movhi;
	}
      else /* move 1 byte at a time */
	{
	  move_bytes = 1;
	  mode = QImode;
	  gen_func.mov = gen_movqi;
	}

      src = adjust_address (orig_src, mode, offset);
      dest = adjust_address (orig_dest, mode, offset);

      if (mode != BLKmode)
	{
	  rtx tmp_reg = gen_reg_rtx (mode);

	  /* Emit the load now; queue the store so a group of loads
	     precedes the corresponding stores.  */
	  emit_insn ((*gen_func.mov) (tmp_reg, src));
	  stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
	}

      /* Flush the queued stores when the batch is full or we are done.  */
      if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
	{
	  int i;
	  for (i = 0; i < num_reg; i++)
	    emit_insn (stores[i]);
	  num_reg = 0;
	}

      /* NOTE(review): every arm of the selection chain above assigns a
	 non-BLK mode, so this BLKmode path looks unreachable in this
	 version — presumably retained from the old string-op support;
	 confirm before relying on it.  */
      if (mode == BLKmode)
	{
	  /* Move the address into scratch registers.  The movmemsi
	     patterns require zero offset.  */
	  if (!REG_P (XEXP (src, 0)))
	    {
	      rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
	      src = replace_equiv_address (src, src_reg);
	    }
	  set_mem_size (src, move_bytes);

	  if (!REG_P (XEXP (dest, 0)))
	    {
	      rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
	      dest = replace_equiv_address (dest, dest_reg);
	    }
	  set_mem_size (dest, move_bytes);

	  emit_insn ((*gen_func.movmemsi) (dest, src,
					   GEN_INT (move_bytes & 31),
					   align_rtx));
	}
    }

  return 1;
}