]>
Commit | Line | Data |
---|---|---|
8845cb37 AS |
1 | /* Subroutines used to expand string and block move, clear, |
2 | compare and other operations for PowerPC. | |
85ec4feb | 3 | Copyright (C) 1991-2018 Free Software Foundation, Inc. |
8845cb37 AS |
4 | |
5 | This file is part of GCC. | |
6 | ||
7 | GCC is free software; you can redistribute it and/or modify it | |
8 | under the terms of the GNU General Public License as published | |
9 | by the Free Software Foundation; either version 3, or (at your | |
10 | option) any later version. | |
11 | ||
12 | GCC is distributed in the hope that it will be useful, but WITHOUT | |
13 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
14 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public | |
15 | License for more details. | |
16 | ||
17 | You should have received a copy of the GNU General Public License | |
18 | along with GCC; see the file COPYING3. If not see | |
19 | <http://www.gnu.org/licenses/>. */ | |
20 | ||
8fcc61f8 RS |
21 | #define IN_TARGET_CODE 1 |
22 | ||
8845cb37 AS |
23 | #include "config.h" |
24 | #include "system.h" | |
25 | #include "coretypes.h" | |
26 | #include "backend.h" | |
27 | #include "rtl.h" | |
28 | #include "tree.h" | |
29 | #include "memmodel.h" | |
30 | #include "tm_p.h" | |
31 | #include "ira.h" | |
32 | #include "print-tree.h" | |
33 | #include "varasm.h" | |
34 | #include "explow.h" | |
35 | #include "expr.h" | |
36 | #include "output.h" | |
e0bd6c9f | 37 | #include "target.h" |
8845cb37 AS |
38 | |
/* Expand a block clear operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the length
   operands[3] is the alignment (operands[2] is not read here).  */

int
expand_block_clear (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx bytes_rtx = operands[1];
  rtx align_rtx = operands[3];
  bool constp = (GET_CODE (bytes_rtx) == CONST_INT);
  HOST_WIDE_INT align;
  HOST_WIDE_INT bytes;
  int offset;			/* Byte offset of the current store.  */
  int clear_bytes;		/* Bytes cleared by the current store.  */
  int clear_step;		/* Widest single-store size usable here.  */

  /* If this is not a fixed size clear, punt; returning 0 lets the
     compiler generate normal code (i.e. a library call).  */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment.  */
  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to clear? */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  /* Use the builtin memset after a point, to avoid huge code bloat.
     When optimize_size, avoid any significant code bloat; calling
     memset is about 4 instructions, so allow for one instruction to
     load zero and three to do clearing.  */
  if (TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
    clear_step = 16;
  else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
    clear_step = 8;
  else
    clear_step = 4;

  if (optimize_size && bytes > 3 * clear_step)
    return 0;
  if (! optimize_size && bytes > 8 * clear_step)
    return 0;

  /* Emit zero stores, widest first, narrowing as the remaining byte
     count and alignment force it.  */
  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
    {
      machine_mode mode = BLKmode;
      rtx dest;

      /* Vector stores: either 16B with 16B alignment, or any alignment
	 when unaligned VSX accesses are cheap (then require 32B left so
	 the vector path is worthwhile).  */
      if (TARGET_ALTIVEC
	  && ((bytes >= 16 && align >= 128)
	      || (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX)))
	{
	  clear_bytes = 16;
	  mode = V4SImode;
	}
      else if (bytes >= 8 && TARGET_POWERPC64
	       && (align >= 64 || !STRICT_ALIGNMENT))
	{
	  clear_bytes = 8;
	  mode = DImode;
	  if (offset == 0 && align < 64)
	    {
	      rtx addr;

	      /* If the address form is reg+offset with offset not a
		 multiple of four, reload into reg indirect form here
		 rather than waiting for reload.  This way we get one
		 reload, not one per store.  */
	      addr = XEXP (orig_dest, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_dest = replace_equiv_address (orig_dest, addr);
		}
	    }
	}
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
	{			/* move 4 bytes */
	  clear_bytes = 4;
	  mode = SImode;
	}
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
	{			/* move 2 bytes */
	  clear_bytes = 2;
	  mode = HImode;
	}
      else /* move 1 byte at a time */
	{
	  clear_bytes = 1;
	  mode = QImode;
	}

      dest = adjust_address (orig_dest, mode, offset);

      emit_move_insn (dest, CONST0_RTX (mode));
    }

  return 1;
}
146 | ||
147 | /* Figure out the correct instructions to generate to load data for | |
148 | block compare. MODE is used for the read from memory, and | |
149 | data is zero extended if REG is wider than MODE. If LE code | |
150 | is being generated, bswap loads are used. | |
151 | ||
152 | REG is the destination register to move the data into. | |
153 | MEM is the memory block being read. | |
154 | MODE is the mode of memory to use for the read. */ | |
155 | static void | |
156 | do_load_for_compare (rtx reg, rtx mem, machine_mode mode) | |
157 | { | |
158 | switch (GET_MODE (reg)) | |
159 | { | |
9d36bd3b AS |
160 | case E_V16QImode: |
161 | switch (mode) | |
162 | { | |
163 | case E_V16QImode: | |
164 | if (!BYTES_BIG_ENDIAN) | |
165 | { | |
166 | if (TARGET_P9_VECTOR) | |
167 | emit_insn (gen_vsx_ld_elemrev_v16qi_internal (reg, mem)); | |
168 | else | |
169 | { | |
170 | rtx reg_v2di = simplify_gen_subreg (V2DImode, reg, | |
171 | V16QImode, 0); | |
172 | gcc_assert (MEM_P (mem)); | |
173 | rtx addr = XEXP (mem, 0); | |
174 | rtx mem_v2di = gen_rtx_MEM (V2DImode, addr); | |
175 | MEM_COPY_ATTRIBUTES (mem_v2di, mem); | |
176 | set_mem_size (mem, GET_MODE_SIZE (V2DImode)); | |
177 | emit_insn (gen_vsx_ld_elemrev_v2di (reg_v2di, mem_v2di)); | |
178 | } | |
179 | } | |
180 | else | |
181 | emit_insn (gen_vsx_movv2di_64bit (reg, mem)); | |
182 | break; | |
183 | default: | |
184 | gcc_unreachable (); | |
185 | } | |
186 | break; | |
4e10a5a7 | 187 | case E_DImode: |
8845cb37 AS |
188 | switch (mode) |
189 | { | |
4e10a5a7 | 190 | case E_QImode: |
8845cb37 AS |
191 | emit_insn (gen_zero_extendqidi2 (reg, mem)); |
192 | break; | |
4e10a5a7 | 193 | case E_HImode: |
8845cb37 AS |
194 | { |
195 | rtx src = mem; | |
196 | if (!BYTES_BIG_ENDIAN) | |
197 | { | |
198 | src = gen_reg_rtx (HImode); | |
199 | emit_insn (gen_bswaphi2 (src, mem)); | |
200 | } | |
201 | emit_insn (gen_zero_extendhidi2 (reg, src)); | |
202 | break; | |
203 | } | |
4e10a5a7 | 204 | case E_SImode: |
8845cb37 AS |
205 | { |
206 | rtx src = mem; | |
207 | if (!BYTES_BIG_ENDIAN) | |
208 | { | |
209 | src = gen_reg_rtx (SImode); | |
210 | emit_insn (gen_bswapsi2 (src, mem)); | |
211 | } | |
212 | emit_insn (gen_zero_extendsidi2 (reg, src)); | |
213 | } | |
214 | break; | |
4e10a5a7 | 215 | case E_DImode: |
8845cb37 AS |
216 | if (!BYTES_BIG_ENDIAN) |
217 | emit_insn (gen_bswapdi2 (reg, mem)); | |
218 | else | |
219 | emit_insn (gen_movdi (reg, mem)); | |
220 | break; | |
221 | default: | |
222 | gcc_unreachable (); | |
223 | } | |
224 | break; | |
225 | ||
4e10a5a7 | 226 | case E_SImode: |
8845cb37 AS |
227 | switch (mode) |
228 | { | |
4e10a5a7 | 229 | case E_QImode: |
8845cb37 AS |
230 | emit_insn (gen_zero_extendqisi2 (reg, mem)); |
231 | break; | |
4e10a5a7 | 232 | case E_HImode: |
8845cb37 AS |
233 | { |
234 | rtx src = mem; | |
235 | if (!BYTES_BIG_ENDIAN) | |
236 | { | |
237 | src = gen_reg_rtx (HImode); | |
238 | emit_insn (gen_bswaphi2 (src, mem)); | |
239 | } | |
240 | emit_insn (gen_zero_extendhisi2 (reg, src)); | |
241 | break; | |
242 | } | |
4e10a5a7 | 243 | case E_SImode: |
8845cb37 AS |
244 | if (!BYTES_BIG_ENDIAN) |
245 | emit_insn (gen_bswapsi2 (reg, mem)); | |
246 | else | |
247 | emit_insn (gen_movsi (reg, mem)); | |
248 | break; | |
4e10a5a7 | 249 | case E_DImode: |
8845cb37 AS |
250 | /* DImode is larger than the destination reg so is not expected. */ |
251 | gcc_unreachable (); | |
252 | break; | |
253 | default: | |
254 | gcc_unreachable (); | |
255 | } | |
256 | break; | |
9d36bd3b AS |
257 | |
258 | case E_QImode: | |
259 | gcc_assert (mode == E_QImode); | |
260 | emit_move_insn (reg, mem); | |
261 | break; | |
ef4adf1f | 262 | |
8845cb37 AS |
263 | default: |
264 | gcc_unreachable (); | |
265 | break; | |
266 | } | |
267 | } | |
268 | ||
269 | /* Select the mode to be used for reading the next chunk of bytes | |
270 | in the compare. | |
271 | ||
272 | OFFSET is the current read offset from the beginning of the block. | |
273 | BYTES is the number of bytes remaining to be read. | |
74f9986e | 274 | ALIGN is the minimum alignment of the memory blocks being compared in bytes. */ |
8845cb37 AS |
275 | static machine_mode |
276 | select_block_compare_mode (unsigned HOST_WIDE_INT offset, | |
277 | unsigned HOST_WIDE_INT bytes, | |
74f9986e | 278 | unsigned HOST_WIDE_INT align) |
8845cb37 AS |
279 | { |
280 | /* First see if we can do a whole load unit | |
281 | as that will be more efficient than a larger load + shift. */ | |
282 | ||
283 | /* If big, use biggest chunk. | |
284 | If exactly chunk size, use that size. | |
285 | If remainder can be done in one piece with shifting, do that. | |
286 | Do largest chunk possible without violating alignment rules. */ | |
287 | ||
288 | /* The most we can read without potential page crossing. */ | |
289 | unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align); | |
290 | ||
74f9986e AS |
291 | /* If we have an LE target without ldbrx and word_mode is DImode, |
292 | then we must avoid using word_mode. */ | |
293 | int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX | |
294 | && word_mode == DImode); | |
295 | ||
8845cb37 AS |
296 | if (word_mode_ok && bytes >= UNITS_PER_WORD) |
297 | return word_mode; | |
298 | else if (bytes == GET_MODE_SIZE (SImode)) | |
299 | return SImode; | |
300 | else if (bytes == GET_MODE_SIZE (HImode)) | |
301 | return HImode; | |
302 | else if (bytes == GET_MODE_SIZE (QImode)) | |
303 | return QImode; | |
304 | else if (bytes < GET_MODE_SIZE (SImode) | |
f7e94dfb | 305 | && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED |
8845cb37 AS |
306 | && offset >= GET_MODE_SIZE (SImode) - bytes) |
307 | /* This matches the case were we have SImode and 3 bytes | |
308 | and offset >= 1 and permits us to move back one and overlap | |
309 | with the previous read, thus avoiding having to shift | |
310 | unwanted bytes off of the input. */ | |
311 | return SImode; | |
312 | else if (word_mode_ok && bytes < UNITS_PER_WORD | |
f7e94dfb | 313 | && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED |
8845cb37 AS |
314 | && offset >= UNITS_PER_WORD-bytes) |
315 | /* Similarly, if we can use DImode it will get matched here and | |
316 | can do an overlapping read that ends at the end of the block. */ | |
317 | return word_mode; | |
318 | else if (word_mode_ok && maxread >= UNITS_PER_WORD) | |
319 | /* It is safe to do all remaining in one load of largest size, | |
320 | possibly with a shift to get rid of unwanted bytes. */ | |
321 | return word_mode; | |
322 | else if (maxread >= GET_MODE_SIZE (SImode)) | |
323 | /* It is safe to do all remaining in one SImode load, | |
324 | possibly with a shift to get rid of unwanted bytes. */ | |
325 | return SImode; | |
326 | else if (bytes > GET_MODE_SIZE (SImode)) | |
327 | return SImode; | |
328 | else if (bytes > GET_MODE_SIZE (HImode)) | |
329 | return HImode; | |
330 | ||
331 | /* final fallback is do one byte */ | |
332 | return QImode; | |
333 | } | |
334 | ||
335 | /* Compute the alignment of pointer+OFFSET where the original alignment | |
336 | of pointer was BASE_ALIGN. */ | |
337 | static unsigned HOST_WIDE_INT | |
338 | compute_current_alignment (unsigned HOST_WIDE_INT base_align, | |
339 | unsigned HOST_WIDE_INT offset) | |
340 | { | |
341 | if (offset == 0) | |
342 | return base_align; | |
343 | return MIN (base_align, offset & -offset); | |
344 | } | |
345 | ||
5ec3397e AS |
346 | /* Prepare address and then do a load. |
347 | ||
348 | MODE is the mode to use for the load. | |
349 | DEST is the destination register for the data. | |
350 | ADDR is the address to be loaded. | |
351 | ORIG_ADDR is the original address expression. */ | |
352 | static void | |
353 | do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr, | |
354 | rtx orig_addr) | |
355 | { | |
356 | rtx mem = gen_rtx_MEM (mode, addr); | |
357 | MEM_COPY_ATTRIBUTES (mem, orig_addr); | |
358 | set_mem_size (mem, GET_MODE_SIZE (mode)); | |
359 | do_load_for_compare (dest, mem, mode); | |
360 | return; | |
361 | } | |
362 | ||
/* Do a branch for an if/else decision.

   CMPMODE is the mode to use for the comparison.
   COMPARISON is the rtx code for the compare needed.
   A is the first thing to be compared.
   B is the second thing to be compared.
   CR is the condition code reg input, or NULL_RTX.
   TRUE_LABEL is the label to branch to if the condition is true.

   If CR is NULL_RTX, a new register of CMPMODE is generated for the
   compare.  If A and B are both NULL_RTX, then CR must not be null and
   the compare is not generated, so you can use this with a dot form
   insn that already set the condition register.  */

static void
do_ifelse (machine_mode cmpmode, rtx_code comparison,
	   rtx a, rtx b, rtx cr, rtx true_label)
{
  /* Either both operands are given (emit the compare here), or neither
     is and CR was set by a previous insn.  */
  gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
	      || (a != NULL_RTX && b != NULL_RTX));

  if (cr != NULL_RTX)
    gcc_assert (GET_MODE (cr) == cmpmode);
  else
    cr = gen_reg_rtx (cmpmode);

  rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label);

  /* Emit the compare only when operands were supplied.  */
  if (a != NULL_RTX)
    emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b));

  rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);

  /* Conditional jump to TRUE_LABEL, falling through otherwise; keep
     JUMP_LABEL and the label use count consistent for later passes.  */
  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
  JUMP_LABEL (j) = true_label;
  LABEL_NUSES (true_label) += 1;
}
401 | ||
402 | /* Emit an isel of the proper mode for DEST. | |
403 | ||
404 | DEST is the isel destination register. | |
405 | SRC1 is the isel source if CR is true. | |
406 | SRC2 is the isel source if CR is false. | |
407 | CR is the condition for the isel. */ | |
408 | static void | |
409 | do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr) | |
410 | { | |
411 | if (GET_MODE (dest) == DImode) | |
412 | emit_insn (gen_isel_signed_di (dest, cmp, src_t, src_f, cr)); | |
413 | else | |
414 | emit_insn (gen_isel_signed_si (dest, cmp, src_t, src_f, cr)); | |
415 | } | |
416 | ||
417 | /* Emit a subtract of the proper mode for DEST. | |
418 | ||
419 | DEST is the destination register for the subtract. | |
420 | SRC1 is the first subtract input. | |
421 | SRC2 is the second subtract input. | |
422 | ||
423 | Computes DEST = SRC1-SRC2. */ | |
424 | static void | |
425 | do_sub3 (rtx dest, rtx src1, rtx src2) | |
426 | { | |
427 | if (GET_MODE (dest) == DImode) | |
428 | emit_insn (gen_subdi3 (dest, src1, src2)); | |
429 | else | |
430 | emit_insn (gen_subsi3 (dest, src1, src2)); | |
431 | } | |
432 | ||
433 | /* Emit an add of the proper mode for DEST. | |
434 | ||
435 | DEST is the destination register for the add. | |
436 | SRC1 is the first add input. | |
437 | SRC2 is the second add input. | |
438 | ||
439 | Computes DEST = SRC1+SRC2. */ | |
440 | static void | |
441 | do_add3 (rtx dest, rtx src1, rtx src2) | |
442 | { | |
443 | if (GET_MODE (dest) == DImode) | |
444 | emit_insn (gen_adddi3 (dest, src1, src2)); | |
445 | else | |
446 | emit_insn (gen_addsi3 (dest, src1, src2)); | |
447 | } | |
448 | ||
f7e94dfb AS |
449 | /* Emit an and of the proper mode for DEST. |
450 | ||
451 | DEST is the destination register for the and. | |
452 | SRC1 is the first and input. | |
453 | SRC2 is the second and input. | |
454 | ||
455 | Computes DEST = SRC1&SRC2. */ | |
456 | static void | |
457 | do_and3 (rtx dest, rtx src1, rtx src2) | |
458 | { | |
459 | if (GET_MODE (dest) == DImode) | |
460 | emit_insn (gen_anddi3 (dest, src1, src2)); | |
461 | else | |
462 | emit_insn (gen_andsi3 (dest, src1, src2)); | |
463 | } | |
464 | ||
465 | /* Emit an cmpb of the proper mode for DEST. | |
466 | ||
467 | DEST is the destination register for the cmpb. | |
468 | SRC1 is the first input. | |
469 | SRC2 is the second input. | |
470 | ||
471 | Computes cmpb of SRC1, SRC2. */ | |
472 | static void | |
473 | do_cmpb3 (rtx dest, rtx src1, rtx src2) | |
474 | { | |
475 | if (GET_MODE (dest) == DImode) | |
476 | emit_insn (gen_cmpbdi3 (dest, src1, src2)); | |
477 | else | |
478 | emit_insn (gen_cmpbsi3 (dest, src1, src2)); | |
479 | } | |
480 | ||
481 | /* Emit a rotl of the proper mode for DEST. | |
482 | ||
483 | DEST is the destination register for the and. | |
484 | SRC1 is the first and input. | |
485 | SRC2 is the second and input. | |
486 | ||
487 | Computes DEST = SRC1 rotated left by SRC2. */ | |
488 | static void | |
489 | do_rotl3 (rtx dest, rtx src1, rtx src2) | |
490 | { | |
491 | if (GET_MODE (dest) == DImode) | |
492 | emit_insn (gen_rotldi3 (dest, src1, src2)); | |
493 | else | |
494 | emit_insn (gen_rotlsi3 (dest, src1, src2)); | |
495 | } | |
496 | ||
5ec3397e AS |
/* Generate rtl for a load, shift, and compare of less than a full word.

   LOAD_MODE is the machine mode for the loads.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem, rtx dcond,
		      rtx src1_addr, rtx src2_addr, rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  rtx shift_amount = gen_reg_rtx (word_mode);
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  /* Load a full LOAD_MODE chunk from each source, then compute how many
     bytes of it are past the remainder: load_mode_size - cmp_rem.  */
  do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2);
  do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem);

  /* Convert the byte count to a bit count, then shift both loaded
     values right to discard the bytes beyond the remainder (the loads
     put memory order into MSB-first order, so the unwanted trailing
     bytes are the low-order ones).  The DImode shift patterns take an
     SImode shift count, hence the gen_lowpart.  */
  if (word_mode == DImode)
    {
      emit_insn (gen_ashldi3 (shift_amount, shift_amount,
			      GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrdi3 (d1, d1,
			      gen_lowpart (SImode, shift_amount)));
      emit_insn (gen_lshrdi3 (d2, d2,
			      gen_lowpart (SImode, shift_amount)));
    }
  else
    {
      emit_insn (gen_ashlsi3 (shift_amount, shift_amount,
			      GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrsi3 (d1, d1, shift_amount));
      emit_insn (gen_lshrsi3 (d2, d2, shift_amount));
    }

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      /* Pre-P9: subtract-from with carry; DIFF and the carry are
	 consumed by the caller's difference-handling code.  */
      if (word_mode == DImode)
	emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
	emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}
551 | ||
/* Generate rtl for an overlapping load and compare of less than a
   full load_mode.  This assumes that the previous word is part of the
   block being compared so it's ok to back up part of a word so we can
   compare the last unaligned full word that ends at the end of the block.

   LOAD_MODE is the machine mode for the loads.
   ISCONST tells whether the remaining length is a constant or in a register.
   BYTES_REM is the remaining length if ISCONST is true.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare if !ISCONST.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_overlap_load_compare (machine_mode load_mode, bool isConst,
			HOST_WIDE_INT bytes_rem, rtx diff,
			 rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr,
			 rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  /* How far to back up so a full LOAD_MODE read ends exactly at the
     end of the block (constant-length case).  */
  HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem;
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  rtx addr1, addr2;
  if (!isConst || addr_adj)
    {
      rtx adj_reg = gen_reg_rtx (word_mode);
      if (isConst)
	/* Known remainder: back up by the constant adjustment.  */
	emit_move_insn (adj_reg, GEN_INT (-addr_adj));
      else
	{
	  /* Runtime remainder: adjustment is cmp_rem - load_mode_size,
	     which is negative or zero, backing up the same way.  */
	  rtx reg_lms = gen_reg_rtx (word_mode);
	  emit_move_insn (reg_lms, GEN_INT (load_mode_size));
	  do_sub3 (adj_reg, cmp_rem, reg_lms);
	}

      addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg);
      addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg);
    }
  else
    {
      /* Remainder is exactly a full LOAD_MODE; no back-up needed.  */
      addr1 = src1_addr;
      addr2 = src2_addr;
    }

  do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      /* Pre-P9: subtract-from with carry; DIFF and the carry are
	 consumed by the caller's difference-handling code.  */
      if (word_mode == DImode)
	emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
	emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}
617 | ||
618 | /* Expand a block compare operation using loop code, and return true | |
619 | if successful. Return false if we should let the compiler generate | |
620 | normal code, probably a memcmp call. | |
621 | ||
622 | OPERANDS[0] is the target (result). | |
623 | OPERANDS[1] is the first source. | |
624 | OPERANDS[2] is the second source. | |
625 | OPERANDS[3] is the length. | |
626 | OPERANDS[4] is the alignment. */ | |
627 | bool | |
628 | expand_compare_loop (rtx operands[]) | |
629 | { | |
630 | rtx target = operands[0]; | |
631 | rtx orig_src1 = operands[1]; | |
632 | rtx orig_src2 = operands[2]; | |
633 | rtx bytes_rtx = operands[3]; | |
634 | rtx align_rtx = operands[4]; | |
635 | ||
636 | /* This case is complicated to handle because the subtract | |
637 | with carry instructions do not generate the 64-bit | |
638 | carry and so we must emit code to calculate it ourselves. | |
639 | We choose not to implement this yet. */ | |
640 | if (TARGET_32BIT && TARGET_POWERPC64) | |
641 | return false; | |
642 | ||
643 | /* Allow non-const length. */ | |
644 | int bytes_is_const = CONST_INT_P (bytes_rtx); | |
645 | ||
646 | /* This must be a fixed size alignment. */ | |
647 | if (!CONST_INT_P (align_rtx)) | |
648 | return false; | |
649 | ||
650 | HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT; | |
651 | HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT; | |
652 | HOST_WIDE_INT minalign = MIN (align1, align2); | |
653 | ||
654 | bool isP7 = (rs6000_tune == PROCESSOR_POWER7); | |
655 | ||
656 | gcc_assert (GET_MODE (target) == SImode); | |
657 | ||
658 | /* Anything to move? */ | |
659 | HOST_WIDE_INT bytes = 0; | |
660 | if (bytes_is_const) | |
661 | bytes = INTVAL (bytes_rtx); | |
662 | ||
663 | if (bytes_is_const && bytes == 0) | |
664 | return true; | |
665 | ||
666 | /* Limit the amount we compare, if known statically. */ | |
667 | HOST_WIDE_INT max_bytes; | |
668 | switch (rs6000_tune) | |
669 | { | |
670 | case PROCESSOR_POWER7: | |
671 | if (!bytes_is_const) | |
672 | if (minalign < 8) | |
673 | max_bytes = 0; | |
674 | else | |
675 | max_bytes = 128; | |
676 | else | |
677 | if (minalign < 8) | |
678 | max_bytes = 32; | |
679 | else | |
680 | max_bytes = 128; | |
681 | break; | |
682 | case PROCESSOR_POWER8: | |
683 | if (!bytes_is_const) | |
684 | max_bytes = 0; | |
685 | else | |
686 | if (minalign < 8) | |
687 | max_bytes = 128; | |
688 | else | |
689 | max_bytes = 64; | |
690 | break; | |
691 | case PROCESSOR_POWER9: | |
692 | if (bytes_is_const) | |
693 | max_bytes = 191; | |
694 | else | |
695 | max_bytes = 0; | |
696 | break; | |
697 | default: | |
698 | max_bytes = 128; | |
699 | } | |
700 | ||
701 | /* Allow the option to override the default. */ | |
702 | if (rs6000_block_compare_inline_loop_limit >= 0) | |
703 | max_bytes = (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_loop_limit; | |
704 | ||
705 | if (max_bytes == 0) | |
706 | return false; | |
707 | ||
708 | rtx cmp_rem = gen_reg_rtx (word_mode); /* Remainder for library call. */ | |
709 | rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop. */ | |
710 | HOST_WIDE_INT niter; | |
711 | rtx iter = gen_reg_rtx (word_mode); | |
712 | rtx iv1 = gen_reg_rtx (word_mode); | |
713 | rtx iv2 = gen_reg_rtx (word_mode); | |
714 | rtx d1_1 = gen_reg_rtx (word_mode); /* Addr expression src1+iv1 */ | |
715 | rtx d1_2 = gen_reg_rtx (word_mode); /* Addr expression src1+iv2 */ | |
716 | rtx d2_1 = gen_reg_rtx (word_mode); /* Addr expression src2+iv1 */ | |
717 | rtx d2_2 = gen_reg_rtx (word_mode); /* Addr expression src2+iv2 */ | |
718 | ||
719 | /* Strip unneeded subreg from length if there is one. */ | |
720 | if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx)) | |
721 | bytes_rtx = SUBREG_REG (bytes_rtx); | |
722 | /* Extend bytes_rtx to word_mode if needed. But, we expect only to | |
723 | maybe have to deal with the case were bytes_rtx is SImode and | |
724 | word_mode is DImode. */ | |
725 | if (!bytes_is_const) | |
726 | { | |
727 | if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode)) | |
728 | /* Do not expect length longer than word_mode. */ | |
ef4adf1f | 729 | return false; |
5ec3397e AS |
730 | else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode)) |
731 | { | |
732 | bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx); | |
733 | bytes_rtx = force_reg (word_mode, | |
734 | gen_rtx_fmt_e (ZERO_EXTEND, word_mode, | |
735 | bytes_rtx)); | |
736 | } | |
737 | else | |
738 | /* Make sure it's in a register before we get started. */ | |
739 | bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx); | |
740 | } | |
741 | ||
742 | machine_mode load_mode = word_mode; | |
743 | HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode); | |
744 | ||
745 | /* Number of bytes per iteration of the unrolled loop. */ | |
746 | HOST_WIDE_INT loop_bytes = 2 * load_mode_size; | |
747 | /* max iters and bytes compared in the loop. */ | |
748 | HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes; | |
749 | HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes; | |
750 | int l2lb = floor_log2 (loop_bytes); | |
751 | ||
752 | if (bytes_is_const && (max_bytes < load_mode_size | |
753 | || !IN_RANGE (bytes, load_mode_size, max_bytes))) | |
754 | return false; | |
755 | ||
756 | bool no_remainder_code = false; | |
757 | rtx final_label = gen_label_rtx (); | |
758 | rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); | |
759 | rtx diff_label = gen_label_rtx (); | |
760 | rtx library_call_label = NULL; | |
761 | rtx cleanup_label = gen_label_rtx (); | |
762 | ||
763 | rtx cr; | |
764 | ||
765 | rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0)); | |
766 | rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0)); | |
767 | ||
768 | /* Difference found is stored here before jump to diff_label. */ | |
769 | rtx diff = gen_reg_rtx (word_mode); | |
770 | rtx j; | |
771 | ||
772 | /* Example of generated code for 35 bytes aligned 1 byte. | |
ef4adf1f | 773 | |
5ec3397e AS |
774 | mtctr 8 |
775 | li 6,0 | |
776 | li 5,8 | |
777 | .L13: | |
778 | ldbrx 7,3,6 | |
779 | ldbrx 9,10,6 | |
780 | ldbrx 0,3,5 | |
781 | ldbrx 4,10,5 | |
782 | addi 6,6,16 | |
783 | addi 5,5,16 | |
784 | subfc. 9,9,7 | |
785 | bne 0,.L10 | |
786 | subfc. 9,4,0 | |
787 | bdnzt 2,.L13 | |
788 | bne 0,.L10 | |
789 | add 3,3,6 | |
790 | add 10,10,6 | |
791 | addi 9,3,-5 | |
792 | ldbrx 7,0,9 | |
793 | addi 9,10,-5 | |
794 | ldbrx 9,0,9 | |
795 | subfc 9,9,7 | |
796 | .p2align 4,,15 | |
797 | .L10: | |
798 | popcntd 9,9 | |
799 | subfe 10,10,10 | |
800 | or 9,9,10 | |
ef4adf1f | 801 | |
5ec3397e AS |
802 | Compiled with -fno-reorder-blocks for clarity. */ |
803 | ||
804 | /* Structure of what we're going to do: | |
805 | Two separate lengths: what we will compare before bailing to library | |
806 | call (max_bytes), and the total length to be checked. | |
807 | if length <= 16, branch to linear cleanup code starting with | |
808 | remainder length check (length not known at compile time) | |
809 | set up 2 iv's and load count reg, compute remainder length | |
810 | unrollx2 compare loop | |
811 | if loop exit due to a difference, branch to difference handling code | |
812 | if remainder length < 8, branch to final cleanup compare | |
813 | load and compare 8B | |
814 | final cleanup comparison (depends on alignment and length) | |
815 | load 8B, shift off bytes past length, compare | |
816 | load 8B ending at last byte and compare | |
817 | load/compare 1 byte at a time (short block abutting 4k boundary) | |
818 | difference handling, 64->32 conversion | |
819 | final result | |
820 | branch around memcmp call | |
821 | memcmp library call | |
822 | */ | |
823 | ||
824 | /* If bytes is not const, compare length and branch directly | |
825 | to the cleanup code that can handle 0-16 bytes if length | |
826 | is >= 16. Stash away bytes-max_bytes for the library call. */ | |
827 | if (bytes_is_const) | |
828 | { | |
829 | /* These need to be set for some of the places we may jump to. */ | |
830 | if (bytes > max_bytes) | |
831 | { | |
832 | no_remainder_code = true; | |
833 | niter = max_loop_iter; | |
834 | library_call_label = gen_label_rtx (); | |
835 | } | |
836 | else | |
837 | { | |
838 | niter = bytes / loop_bytes; | |
839 | } | |
840 | emit_move_insn (iter, GEN_INT (niter)); | |
841 | emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes)); | |
842 | emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes)); | |
843 | } | |
844 | else | |
845 | { | |
846 | library_call_label = gen_label_rtx (); | |
847 | ||
848 | /* If we go to the cleanup code, it expects length to be in cmp_rem. */ | |
849 | emit_move_insn (cmp_rem, bytes_rtx); | |
850 | ||
851 | /* Check for > max_bytes bytes. We want to bail out as quickly as | |
852 | possible if we have to go over to memcmp. */ | |
853 | do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes), | |
854 | NULL_RTX, library_call_label); | |
855 | ||
856 | /* Check for < loop_bytes bytes. */ | |
857 | do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes), | |
858 | NULL_RTX, cleanup_label); | |
859 | ||
860 | /* Loop compare bytes and iterations if bytes>max_bytes. */ | |
861 | rtx mb_reg = gen_reg_rtx (word_mode); | |
862 | emit_move_insn (mb_reg, GEN_INT (max_loop_bytes)); | |
863 | rtx mi_reg = gen_reg_rtx (word_mode); | |
864 | emit_move_insn (mi_reg, GEN_INT (max_loop_iter)); | |
865 | ||
866 | /* Compute number of loop iterations if bytes <= max_bytes. */ | |
867 | if (word_mode == DImode) | |
868 | emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb))); | |
869 | else | |
870 | emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb))); | |
871 | ||
872 | /* Compute bytes to compare in loop if bytes <= max_bytes. */ | |
873 | rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb); | |
874 | if (word_mode == DImode) | |
875 | { | |
876 | emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask)); | |
877 | } | |
878 | else | |
879 | { | |
880 | emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask)); | |
881 | } | |
882 | ||
883 | /* Check for bytes <= max_bytes. */ | |
884 | if (TARGET_ISEL) | |
885 | { | |
886 | /* P9 has fast isel so we use one compare and two isel. */ | |
887 | cr = gen_reg_rtx (CCmode); | |
888 | rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx, | |
889 | GEN_INT (max_bytes)); | |
890 | emit_move_insn (cr, compare_rtx); | |
891 | rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx); | |
892 | do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr); | |
893 | do_isel (iter, cmp_rtx, iter, mi_reg, cr); | |
894 | } | |
895 | else | |
896 | { | |
897 | rtx lab_after = gen_label_rtx (); | |
898 | do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes), | |
899 | NULL_RTX, lab_after); | |
900 | emit_move_insn (loop_cmp, mb_reg); | |
901 | emit_move_insn (iter, mi_reg); | |
902 | emit_label (lab_after); | |
903 | } | |
904 | ||
905 | /* Now compute remainder bytes which isn't used until after the loop. */ | |
906 | do_sub3 (cmp_rem, bytes_rtx, loop_cmp); | |
907 | } | |
908 | ||
909 | rtx dcond = NULL_RTX; /* Used for when we jump to diff_label. */ | |
910 | /* For p9 we need to have just one of these as multiple places define | |
911 | it and it gets used by the setb at the end. */ | |
912 | if (TARGET_P9_MISC) | |
913 | dcond = gen_reg_rtx (CCUNSmode); | |
914 | ||
915 | if (!bytes_is_const || bytes >= loop_bytes) | |
916 | { | |
917 | /* It should not be possible to come here if remaining bytes is | |
918 | < 16 in the runtime case either. Compute number of loop | |
919 | iterations. We compare 2*word_mode per iteration so 16B for | |
920 | 64-bit code and 8B for 32-bit. Set up two induction | |
921 | variables and load count register. */ | |
922 | ||
923 | /* HACK ALERT: create hard reg for CTR here. If we just use a | |
924 | pseudo, cse will get rid of it and then the allocator will | |
925 | see it used in the lshr above and won't give us ctr. */ | |
926 | rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO); | |
927 | emit_move_insn (ctr, iter); | |
928 | emit_move_insn (diff, GEN_INT (0)); | |
929 | emit_move_insn (iv1, GEN_INT (0)); | |
930 | emit_move_insn (iv2, GEN_INT (load_mode_size)); | |
931 | ||
932 | /* inner loop to compare 2*word_mode */ | |
933 | rtx loop_top_label = gen_label_rtx (); | |
934 | emit_label (loop_top_label); | |
935 | ||
936 | rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1); | |
937 | rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1); | |
938 | ||
939 | do_load_for_compare_from_addr (load_mode, d1_1, | |
940 | src1_ix1, orig_src1); | |
941 | do_load_for_compare_from_addr (load_mode, d2_1, | |
942 | src2_ix1, orig_src2); | |
943 | do_add3 (iv1, iv1, GEN_INT (loop_bytes)); | |
944 | ||
945 | rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2); | |
946 | rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2); | |
947 | ||
948 | do_load_for_compare_from_addr (load_mode, d1_2, | |
949 | src1_ix2, orig_src1); | |
950 | do_load_for_compare_from_addr (load_mode, d2_2, | |
951 | src2_ix2, orig_src2); | |
952 | do_add3 (iv2, iv2, GEN_INT (loop_bytes)); | |
953 | ||
954 | if (TARGET_P9_MISC) | |
955 | { | |
956 | /* Generate a compare, and convert with a setb later. */ | |
957 | rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1); | |
958 | emit_insn (gen_rtx_SET (dcond, cmp)); | |
959 | } | |
960 | else | |
961 | { | |
962 | dcond = gen_reg_rtx (CCmode); | |
963 | if (word_mode == DImode) | |
964 | emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond)); | |
965 | else | |
966 | emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond)); | |
967 | } | |
968 | ||
969 | do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX, | |
970 | dcond, diff_label); | |
971 | ||
972 | if (TARGET_P9_MISC) | |
973 | { | |
974 | /* Generate a compare, and convert with a setb later. */ | |
975 | rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2); | |
976 | emit_insn (gen_rtx_SET (dcond, cmp)); | |
977 | } | |
978 | else | |
979 | { | |
980 | dcond = gen_reg_rtx (CCmode); | |
981 | if (word_mode == DImode) | |
982 | emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond)); | |
983 | else | |
984 | emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond)); | |
985 | } | |
986 | ||
987 | rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2); | |
988 | if (TARGET_64BIT) | |
989 | j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr, | |
990 | eqrtx, dcond)); | |
991 | else | |
992 | j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr, | |
993 | eqrtx, dcond)); | |
994 | JUMP_LABEL (j) = loop_top_label; | |
995 | LABEL_NUSES (loop_top_label) += 1; | |
996 | } | |
997 | ||
998 | HOST_WIDE_INT bytes_remaining = 0; | |
999 | if (bytes_is_const) | |
1000 | bytes_remaining = (bytes % loop_bytes); | |
1001 | ||
1002 | /* If diff is nonzero, branch to difference handling | |
1003 | code. If we exit here with a nonzero diff, it is | |
1004 | because the second word differed. */ | |
1005 | if (TARGET_P9_MISC) | |
1006 | do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond, diff_label); | |
1007 | else | |
1008 | do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX, diff_label); | |
1009 | ||
1010 | if (library_call_label != NULL && bytes_is_const && bytes > max_bytes) | |
1011 | { | |
1012 | /* If the length is known at compile time, then we will always | |
1013 | have a remainder to go to the library call with. */ | |
1014 | rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, library_call_label); | |
1015 | j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref)); | |
1016 | JUMP_LABEL (j) = library_call_label; | |
1017 | LABEL_NUSES (library_call_label) += 1; | |
1018 | emit_barrier (); | |
1019 | } | |
1020 | ||
1021 | if (bytes_is_const && bytes_remaining == 0) | |
1022 | { | |
1023 | /* No remainder and if we are here then diff is 0 so just return 0 */ | |
1024 | if (TARGET_64BIT) | |
1025 | emit_insn (gen_movsi (target, gen_lowpart (SImode, diff))); | |
1026 | else | |
1027 | emit_move_insn (target, diff); | |
1028 | j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref)); | |
1029 | JUMP_LABEL (j) = final_label; | |
1030 | LABEL_NUSES (final_label) += 1; | |
1031 | emit_barrier (); | |
1032 | } | |
1033 | else if (!no_remainder_code) | |
1034 | { | |
1035 | /* Update addresses to point to the next word to examine. */ | |
1036 | do_add3 (src1_addr, src1_addr, iv1); | |
1037 | do_add3 (src2_addr, src2_addr, iv1); | |
1038 | ||
1039 | emit_label (cleanup_label); | |
1040 | ||
1041 | if (!bytes_is_const) | |
1042 | { | |
1043 | /* If we're dealing with runtime length, we have to check if | |
ef4adf1f | 1044 | it's zero after the loop. When length is known at compile |
5ec3397e AS |
1045 | time the no-remainder condition is dealt with above. By |
1046 | doing this after cleanup_label, we also deal with the | |
1047 | case where length is 0 at the start and we bypass the | |
1048 | loop with a branch to cleanup_label. */ | |
1049 | emit_move_insn (target, const0_rtx); | |
1050 | do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, | |
1051 | NULL_RTX, final_label); | |
1052 | } | |
1053 | ||
1054 | rtx final_cleanup = gen_label_rtx (); | |
1055 | rtx cmp_rem_before = gen_reg_rtx (word_mode); | |
1056 | /* Compare one more word_mode chunk if needed. */ | |
37ca383f | 1057 | if (!bytes_is_const || bytes_remaining >= load_mode_size) |
5ec3397e AS |
1058 | { |
1059 | /* If remainder length < word length, branch to final | |
1060 | cleanup compare. */ | |
1061 | if (!bytes_is_const) | |
1062 | do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size), | |
1063 | NULL_RTX, final_cleanup); | |
1064 | ||
1065 | /* load and compare 8B */ | |
1066 | do_load_for_compare_from_addr (load_mode, d1_1, | |
1067 | src1_addr, orig_src1); | |
1068 | do_load_for_compare_from_addr (load_mode, d2_1, | |
1069 | src2_addr, orig_src2); | |
1070 | ||
1071 | /* Compare the word, see if we need to do the last partial. */ | |
1072 | if (TARGET_P9_MISC) | |
1073 | { | |
1074 | /* Generate a compare, and convert with a setb later. */ | |
1075 | rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1); | |
1076 | emit_insn (gen_rtx_SET (dcond, cmp)); | |
1077 | } | |
1078 | else | |
1079 | { | |
1080 | dcond = gen_reg_rtx (CCmode); | |
1081 | if (word_mode == DImode) | |
1082 | emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond)); | |
1083 | else | |
1084 | emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond)); | |
1085 | } | |
1086 | ||
1087 | do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX, | |
1088 | dcond, diff_label); | |
1089 | ||
1090 | do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size)); | |
1091 | do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size)); | |
1092 | emit_move_insn (cmp_rem_before, cmp_rem); | |
1093 | do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size)); | |
1094 | if (bytes_is_const) | |
1095 | bytes_remaining -= load_mode_size; | |
1096 | else | |
1097 | /* See if remaining length is now zero. We previously set | |
1098 | target to 0 so we can just jump to the end. */ | |
1099 | do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, | |
1100 | NULL_RTX, final_label); | |
1101 | ||
1102 | } | |
1103 | ||
1104 | /* Cases: | |
1105 | bytes_is_const | |
1106 | We can always shift back to do an overlapping compare | |
1107 | of the last chunk because we know length >= 8. | |
1108 | ||
1109 | !bytes_is_const | |
1110 | align>=load_mode_size | |
1111 | Read word_mode and mask | |
1112 | align<load_mode_size | |
1113 | avoid stepping past end | |
1114 | ||
1115 | Three strategies: | |
1116 | * decrement address and do overlapping compare | |
1117 | * read word_mode and mask | |
1118 | * carefully avoid crossing 4k boundary | |
1119 | */ | |
1120 | ||
1121 | if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7)) | |
1122 | && align1 >= load_mode_size && align2 >= load_mode_size) | |
1123 | { | |
1124 | /* Alignment is larger than word_mode so we do not need to be | |
1125 | concerned with extra page crossings. But, we do not know | |
1126 | that the length is larger than load_mode_size so we might | |
1127 | end up compareing against data before the block if we try | |
1128 | an overlapping compare. Also we use this on P7 for fixed length | |
1129 | remainder because P7 doesn't like overlapping unaligned. | |
1130 | Strategy: load 8B, shift off bytes past length, and compare. */ | |
1131 | emit_label (final_cleanup); | |
1132 | do_load_mask_compare (load_mode, diff, cmp_rem, dcond, | |
1133 | src1_addr, src2_addr, orig_src1, orig_src2); | |
1134 | } | |
1135 | else if (bytes_remaining && bytes_is_const) | |
1136 | { | |
1137 | /* We do not do loop expand if length < 32 so we know at the | |
1138 | end we can do an overlapping compare. | |
1139 | Strategy: shift address back and do word_mode load that | |
1140 | ends at the end of the block. */ | |
1141 | emit_label (final_cleanup); | |
1142 | do_overlap_load_compare (load_mode, true, bytes_remaining, diff, | |
1143 | cmp_rem, dcond, src1_addr, src2_addr, | |
1144 | orig_src1, orig_src2); | |
1145 | } | |
1146 | else if (!bytes_is_const) | |
1147 | { | |
1148 | rtx handle4k_label = gen_label_rtx (); | |
1149 | rtx nonconst_overlap = gen_label_rtx (); | |
1150 | emit_label (nonconst_overlap); | |
1151 | ||
1152 | /* Here we have to handle the case where whe have runtime | |
1153 | length which may be too short for overlap compare, and | |
1154 | alignment is not at least load_mode_size so we have to | |
1155 | tread carefully to avoid stepping across 4k boundaries. */ | |
1156 | ||
1157 | /* If the length after the loop was larger than word_mode | |
1158 | size, we can just do an overlapping compare and we're | |
1159 | done. We fall through to this code from the word_mode | |
1160 | compare that preceeds this. */ | |
1161 | do_overlap_load_compare (load_mode, false, 0, diff, | |
1162 | cmp_rem, dcond, src1_addr, src2_addr, | |
1163 | orig_src1, orig_src2); | |
1164 | ||
1165 | rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label); | |
1166 | j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref)); | |
1167 | JUMP_LABEL (j) = diff_label; | |
1168 | LABEL_NUSES (diff_label) += 1; | |
1169 | emit_barrier (); | |
1170 | ||
1171 | /* If we couldn't do the overlap compare we have to be more | |
1172 | careful of the 4k boundary. Test to see if either | |
1173 | address is less than word_mode_size away from a 4k | |
1174 | boundary. If not, then we can do a load/shift/compare | |
1175 | and we are done. We come to this code if length was less | |
1176 | than word_mode_size. */ | |
1177 | ||
1178 | emit_label (final_cleanup); | |
1179 | ||
1180 | /* We can still avoid the slow case if the length was larger | |
1181 | than one loop iteration, in which case go do the overlap | |
1182 | load compare path. */ | |
1183 | do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes), | |
1184 | NULL_RTX, nonconst_overlap); | |
1185 | ||
1186 | rtx rem4k = gen_reg_rtx (word_mode); | |
1187 | rtx dist1 = gen_reg_rtx (word_mode); | |
1188 | rtx dist2 = gen_reg_rtx (word_mode); | |
1189 | do_sub3 (rem4k, GEN_INT (4096), cmp_rem); | |
1190 | if (word_mode == SImode) | |
1191 | emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff))); | |
1192 | else | |
1193 | emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff))); | |
1194 | do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX, handle4k_label); | |
1195 | if (word_mode == SImode) | |
1196 | emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff))); | |
1197 | else | |
1198 | emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff))); | |
1199 | do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX, handle4k_label); | |
1200 | ||
1201 | /* We don't have a 4k boundary to deal with, so do | |
1202 | a load/shift/compare and jump to diff. */ | |
1203 | ||
1204 | do_load_mask_compare (load_mode, diff, cmp_rem, dcond, | |
1205 | src1_addr, src2_addr, orig_src1, orig_src2); | |
1206 | ||
1207 | j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref)); | |
1208 | JUMP_LABEL (j) = diff_label; | |
1209 | LABEL_NUSES (diff_label) += 1; | |
1210 | emit_barrier (); | |
1211 | ||
1212 | /* Finally in the unlikely case we are inching up to a | |
1213 | 4k boundary we use a compact lbzx/compare loop to do | |
1214 | it a byte at a time. */ | |
1215 | ||
1216 | emit_label (handle4k_label); | |
1217 | ||
1218 | rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO); | |
1219 | emit_move_insn (ctr, cmp_rem); | |
1220 | rtx ixreg = gen_reg_rtx (Pmode); | |
1221 | emit_move_insn (ixreg, const0_rtx); | |
1222 | ||
1223 | rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg); | |
1224 | rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg); | |
1225 | rtx d1 = gen_reg_rtx (word_mode); | |
1226 | rtx d2 = gen_reg_rtx (word_mode); | |
1227 | ||
1228 | rtx fc_loop = gen_label_rtx (); | |
1229 | emit_label (fc_loop); | |
1230 | ||
1231 | do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1); | |
1232 | do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2); | |
1233 | ||
1234 | do_add3 (ixreg, ixreg, const1_rtx); | |
1235 | ||
1236 | rtx cond = gen_reg_rtx (CCmode); | |
1237 | rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2); | |
1238 | rs6000_emit_dot_insn (diff, subexpr, 2, cond); | |
1239 | ||
1240 | rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2); | |
1241 | if (TARGET_64BIT) | |
1242 | j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr, | |
1243 | eqrtx, cond)); | |
1244 | else | |
1245 | j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr, | |
1246 | eqrtx, cond)); | |
1247 | JUMP_LABEL (j) = fc_loop; | |
1248 | LABEL_NUSES (fc_loop) += 1; | |
1249 | ||
1250 | if (TARGET_64BIT) | |
1251 | emit_insn (gen_movsi (target, gen_lowpart (SImode, diff))); | |
1252 | else | |
1253 | emit_move_insn (target, diff); | |
1254 | ||
1255 | /* Since we are comparing bytes, the difference can be used | |
1256 | as the final result and we are done here. */ | |
1257 | j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref)); | |
1258 | JUMP_LABEL (j) = final_label; | |
1259 | LABEL_NUSES (final_label) += 1; | |
1260 | emit_barrier (); | |
1261 | } | |
1262 | } | |
1263 | ||
1264 | emit_label (diff_label); | |
1265 | /* difference handling, 64->32 conversion */ | |
1266 | ||
1267 | /* We need to produce DI result from sub, then convert to target SI | |
1268 | while maintaining <0 / ==0 / >0 properties. This sequence works: | |
1269 | subfc L,A,B | |
1270 | subfe H,H,H | |
1271 | popcntd L,L | |
1272 | rldimi L,H,6,0 | |
1273 | ||
1274 | This is an alternate one Segher cooked up if somebody | |
1275 | wants to expand this for something that doesn't have popcntd: | |
1276 | subfc L,a,b | |
1277 | subfe H,x,x | |
1278 | addic t,L,-1 | |
1279 | subfe v,t,L | |
1280 | or z,v,H | |
1281 | ||
1282 | And finally, p9 can just do this: | |
1283 | cmpld A,B | |
1284 | setb r */ | |
1285 | ||
1286 | if (TARGET_P9_MISC) | |
1287 | emit_insn (gen_setb_unsigned (target, dcond)); | |
1288 | else | |
1289 | { | |
1290 | if (TARGET_64BIT) | |
1291 | { | |
1292 | rtx tmp_reg_ca = gen_reg_rtx (DImode); | |
1293 | emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca)); | |
1294 | emit_insn (gen_popcntddi2 (diff, diff)); | |
1295 | emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca)); | |
1296 | emit_insn (gen_movsi (target, gen_lowpart (SImode, diff))); | |
1297 | } | |
1298 | else | |
1299 | { | |
1300 | rtx tmp_reg_ca = gen_reg_rtx (SImode); | |
1301 | emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca)); | |
1302 | emit_insn (gen_popcntdsi2 (diff, diff)); | |
1303 | emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca)); | |
1304 | } | |
1305 | } | |
1306 | ||
1307 | if (library_call_label != NULL) | |
1308 | { | |
1309 | /* Branch around memcmp call. */ | |
1310 | j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref)); | |
1311 | JUMP_LABEL (j) = final_label; | |
1312 | LABEL_NUSES (final_label) += 1; | |
1313 | emit_barrier (); | |
1314 | ||
1315 | /* Make memcmp library call. cmp_rem is the remaining bytes that | |
1316 | were compared and cmp_rem is the expected amount to be compared | |
1317 | by memcmp. If we don't find a difference in the loop compare, do | |
1318 | the library call directly instead of doing a small compare just | |
1319 | to get to an arbitrary boundary before calling it anyway. | |
1320 | Also, update addresses to point to the next word to examine. */ | |
1321 | emit_label (library_call_label); | |
1322 | ||
1323 | rtx len_rtx = gen_reg_rtx (word_mode); | |
1324 | if (bytes_is_const) | |
1325 | { | |
1326 | emit_move_insn (len_rtx, cmp_rem); | |
1327 | do_add3 (src1_addr, src1_addr, iv1); | |
1328 | do_add3 (src2_addr, src2_addr, iv1); | |
1329 | } | |
1330 | else | |
1331 | emit_move_insn (len_rtx, bytes_rtx); | |
1332 | ||
1333 | tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP); | |
1334 | emit_library_call_value (XEXP (DECL_RTL (fun), 0), | |
1335 | target, LCT_NORMAL, GET_MODE (target), | |
1336 | src1_addr, Pmode, | |
1337 | src2_addr, Pmode, | |
1338 | len_rtx, GET_MODE (len_rtx)); | |
1339 | } | |
1340 | ||
1341 | /* emit final_label */ | |
1342 | emit_label (final_label); | |
1343 | return true; | |
1344 | } | |
1345 | ||
8845cb37 AS |
1346 | /* Expand a block compare operation, and return true if successful. |
1347 | Return false if we should let the compiler generate normal code, | |
1348 | probably a memcmp call. | |
1349 | ||
1350 | OPERANDS[0] is the target (result). | |
1351 | OPERANDS[1] is the first source. | |
1352 | OPERANDS[2] is the second source. | |
1353 | OPERANDS[3] is the length. | |
1354 | OPERANDS[4] is the alignment. */ | |
1355 | bool | |
1356 | expand_block_compare (rtx operands[]) | |
1357 | { | |
1358 | rtx target = operands[0]; | |
1359 | rtx orig_src1 = operands[1]; | |
1360 | rtx orig_src2 = operands[2]; | |
1361 | rtx bytes_rtx = operands[3]; | |
1362 | rtx align_rtx = operands[4]; | |
1363 | HOST_WIDE_INT cmp_bytes = 0; | |
1364 | rtx src1 = orig_src1; | |
1365 | rtx src2 = orig_src2; | |
1366 | ||
1367 | /* This case is complicated to handle because the subtract | |
1368 | with carry instructions do not generate the 64-bit | |
1369 | carry and so we must emit code to calculate it ourselves. | |
1370 | We choose not to implement this yet. */ | |
1371 | if (TARGET_32BIT && TARGET_POWERPC64) | |
1372 | return false; | |
1373 | ||
5ec3397e AS |
1374 | bool isP7 = (rs6000_tune == PROCESSOR_POWER7); |
1375 | ||
1376 | /* Allow this param to shut off all expansion. */ | |
1377 | if (rs6000_block_compare_inline_limit == 0) | |
1378 | return false; | |
1379 | ||
1380 | /* targetm.slow_unaligned_access -- don't do unaligned stuff. | |
1381 | However slow_unaligned_access returns true on P7 even though the | |
1382 | performance of this code is good there. */ | |
1383 | if (!isP7 | |
1384 | && (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1)) | |
1385 | || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2)))) | |
8845cb37 AS |
1386 | return false; |
1387 | ||
5ec3397e AS |
1388 | /* Unaligned l*brx traps on P7 so don't do this. However this should |
1389 | not affect much because LE isn't really supported on P7 anyway. */ | |
1390 | if (isP7 && !BYTES_BIG_ENDIAN) | |
1391 | return false; | |
1392 | ||
1393 | /* If this is not a fixed size compare, try generating loop code and | |
1394 | if that fails just call memcmp. */ | |
1395 | if (!CONST_INT_P (bytes_rtx)) | |
1396 | return expand_compare_loop (operands); | |
1397 | ||
8845cb37 AS |
1398 | /* This must be a fixed size alignment. */ |
1399 | if (!CONST_INT_P (align_rtx)) | |
1400 | return false; | |
1401 | ||
1402 | unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT; | |
1403 | ||
8845cb37 AS |
1404 | gcc_assert (GET_MODE (target) == SImode); |
1405 | ||
1406 | /* Anything to move? */ | |
1407 | unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx); | |
1408 | if (bytes == 0) | |
1409 | return true; | |
1410 | ||
8845cb37 AS |
1411 | rtx tmp_reg_src1 = gen_reg_rtx (word_mode); |
1412 | rtx tmp_reg_src2 = gen_reg_rtx (word_mode); | |
1413 | /* P7/P8 code uses cond for subfc. but P9 uses | |
ef4adf1f | 1414 | it for cmpld which needs CCUNSmode. */ |
8845cb37 AS |
1415 | rtx cond; |
1416 | if (TARGET_P9_MISC) | |
1417 | cond = gen_reg_rtx (CCUNSmode); | |
1418 | else | |
1419 | cond = gen_reg_rtx (CCmode); | |
1420 | ||
8845cb37 AS |
1421 | /* Strategy phase. How many ops will this take and should we expand it? */ |
1422 | ||
1423 | unsigned HOST_WIDE_INT offset = 0; | |
1424 | machine_mode load_mode = | |
74f9986e | 1425 | select_block_compare_mode (offset, bytes, base_align); |
8845cb37 AS |
1426 | unsigned int load_mode_size = GET_MODE_SIZE (load_mode); |
1427 | ||
5ec3397e AS |
1428 | /* We don't want to generate too much code. The loop code can take |
1429 | over for lengths greater than 31 bytes. */ | |
1430 | unsigned HOST_WIDE_INT max_bytes = rs6000_block_compare_inline_limit; | |
8845cb37 | 1431 | if (!IN_RANGE (bytes, 1, max_bytes)) |
5ec3397e AS |
1432 | return expand_compare_loop (operands); |
1433 | ||
1434 | /* The code generated for p7 and older is not faster than glibc | |
1435 | memcmp if alignment is small and length is not short, so bail | |
1436 | out to avoid those conditions. */ | |
1437 | if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED | |
1438 | && ((base_align == 1 && bytes > 16) | |
1439 | || (base_align == 2 && bytes > 32))) | |
8845cb37 AS |
1440 | return false; |
1441 | ||
1442 | bool generate_6432_conversion = false; | |
1443 | rtx convert_label = NULL; | |
1444 | rtx final_label = NULL; | |
1445 | ||
1446 | /* Example of generated code for 18 bytes aligned 1 byte. | |
1447 | Compiled with -fno-reorder-blocks for clarity. | |
1448 | ldbrx 10,31,8 | |
1449 | ldbrx 9,7,8 | |
1450 | subfc. 9,9,10 | |
1451 | bne 0,.L6487 | |
1452 | addi 9,12,8 | |
1453 | addi 5,11,8 | |
1454 | ldbrx 10,0,9 | |
1455 | ldbrx 9,0,5 | |
1456 | subfc. 9,9,10 | |
1457 | bne 0,.L6487 | |
1458 | addi 9,12,16 | |
1459 | lhbrx 10,0,9 | |
1460 | addi 9,11,16 | |
1461 | lhbrx 9,0,9 | |
1462 | subf 9,9,10 | |
1463 | b .L6488 | |
1464 | .p2align 4,,15 | |
1465 | .L6487: #convert_label | |
1466 | popcntd 9,9 | |
1467 | subfe 10,10,10 | |
1468 | or 9,9,10 | |
1469 | .L6488: #final_label | |
1470 | extsw 10,9 | |
1471 | ||
1472 | We start off with DImode for two blocks that jump to the DI->SI conversion | |
1473 | if the difference is found there, then a final block of HImode that skips | |
1474 | the DI->SI conversion. */ | |
1475 | ||
1476 | while (bytes > 0) | |
1477 | { | |
1478 | unsigned int align = compute_current_alignment (base_align, offset); | |
74f9986e | 1479 | load_mode = select_block_compare_mode (offset, bytes, align); |
8845cb37 AS |
1480 | load_mode_size = GET_MODE_SIZE (load_mode); |
1481 | if (bytes >= load_mode_size) | |
1482 | cmp_bytes = load_mode_size; | |
1483 | else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED) | |
1484 | { | |
1485 | /* Move this load back so it doesn't go past the end. | |
1486 | P8/P9 can do this efficiently. */ | |
1487 | unsigned int extra_bytes = load_mode_size - bytes; | |
1488 | cmp_bytes = bytes; | |
1489 | if (extra_bytes < offset) | |
1490 | { | |
1491 | offset -= extra_bytes; | |
1492 | cmp_bytes = load_mode_size; | |
1493 | bytes = cmp_bytes; | |
1494 | } | |
1495 | } | |
1496 | else | |
1497 | /* P7 and earlier can't do the overlapping load trick fast, | |
1498 | so this forces a non-overlapping load and a shift to get | |
1499 | rid of the extra bytes. */ | |
1500 | cmp_bytes = bytes; | |
1501 | ||
1502 | src1 = adjust_address (orig_src1, load_mode, offset); | |
1503 | src2 = adjust_address (orig_src2, load_mode, offset); | |
1504 | ||
1505 | if (!REG_P (XEXP (src1, 0))) | |
1506 | { | |
1507 | rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0)); | |
1508 | src1 = replace_equiv_address (src1, src1_reg); | |
1509 | } | |
f4f867ca | 1510 | set_mem_size (src1, load_mode_size); |
8845cb37 AS |
1511 | |
1512 | if (!REG_P (XEXP (src2, 0))) | |
1513 | { | |
1514 | rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0)); | |
1515 | src2 = replace_equiv_address (src2, src2_reg); | |
1516 | } | |
f4f867ca | 1517 | set_mem_size (src2, load_mode_size); |
8845cb37 AS |
1518 | |
1519 | do_load_for_compare (tmp_reg_src1, src1, load_mode); | |
1520 | do_load_for_compare (tmp_reg_src2, src2, load_mode); | |
1521 | ||
1522 | if (cmp_bytes < load_mode_size) | |
1523 | { | |
1524 | /* Shift unneeded bytes off. */ | |
1525 | rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes)); | |
1526 | if (word_mode == DImode) | |
1527 | { | |
1528 | emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh)); | |
1529 | emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh)); | |
1530 | } | |
1531 | else | |
1532 | { | |
1533 | emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh)); | |
1534 | emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh)); | |
1535 | } | |
1536 | } | |
1537 | ||
1538 | int remain = bytes - cmp_bytes; | |
1539 | if (GET_MODE_SIZE (GET_MODE (target)) > GET_MODE_SIZE (load_mode)) | |
1540 | { | |
1541 | /* Target is larger than load size so we don't need to | |
1542 | reduce result size. */ | |
1543 | ||
1544 | /* We previously did a block that need 64->32 conversion but | |
1545 | the current block does not, so a label is needed to jump | |
1546 | to the end. */ | |
1547 | if (generate_6432_conversion && !final_label) | |
1548 | final_label = gen_label_rtx (); | |
1549 | ||
1550 | if (remain > 0) | |
1551 | { | |
1552 | /* This is not the last block, branch to the end if the result | |
1553 | of this subtract is not zero. */ | |
1554 | if (!final_label) | |
1555 | final_label = gen_label_rtx (); | |
1556 | rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); | |
1557 | rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2); | |
1558 | rtx cr = gen_reg_rtx (CCmode); | |
1559 | rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr); | |
1560 | emit_insn (gen_movsi (target, | |
1561 | gen_lowpart (SImode, tmp_reg_src2))); | |
1562 | rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx); | |
1563 | rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx, | |
1564 | fin_ref, pc_rtx); | |
1565 | rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); | |
1566 | JUMP_LABEL (j) = final_label; | |
1567 | LABEL_NUSES (final_label) += 1; | |
1568 | } | |
1569 | else | |
1570 | { | |
1571 | if (word_mode == DImode) | |
1572 | { | |
1573 | emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1, | |
1574 | tmp_reg_src2)); | |
1575 | emit_insn (gen_movsi (target, | |
1576 | gen_lowpart (SImode, tmp_reg_src2))); | |
1577 | } | |
1578 | else | |
1579 | emit_insn (gen_subsi3 (target, tmp_reg_src1, tmp_reg_src2)); | |
1580 | ||
1581 | if (final_label) | |
1582 | { | |
1583 | rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); | |
1584 | rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref)); | |
5ec3397e | 1585 | JUMP_LABEL (j) = final_label; |
8845cb37 AS |
1586 | LABEL_NUSES (final_label) += 1; |
1587 | emit_barrier (); | |
1588 | } | |
1589 | } | |
1590 | } | |
1591 | else | |
1592 | { | |
1593 | /* Do we need a 64->32 conversion block? We need the 64->32 | |
1594 | conversion even if target size == load_mode size because | |
1595 | the subtract generates one extra bit. */ | |
1596 | generate_6432_conversion = true; | |
1597 | ||
1598 | if (remain > 0) | |
1599 | { | |
1600 | if (!convert_label) | |
1601 | convert_label = gen_label_rtx (); | |
1602 | ||
1603 | /* Compare to zero and branch to convert_label if not zero. */ | |
1604 | rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label); | |
1605 | if (TARGET_P9_MISC) | |
1606 | { | |
1607 | /* Generate a compare, and convert with a setb later. */ | |
1608 | rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1, | |
1609 | tmp_reg_src2); | |
1610 | emit_insn (gen_rtx_SET (cond, cmp)); | |
1611 | } | |
1612 | else | |
1613 | /* Generate a subfc. and use the longer | |
1614 | sequence for conversion. */ | |
1615 | if (TARGET_64BIT) | |
1616 | emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2, | |
1617 | tmp_reg_src1, cond)); | |
1618 | else | |
1619 | emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2, | |
1620 | tmp_reg_src1, cond)); | |
1621 | rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx); | |
1622 | rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx, | |
1623 | cvt_ref, pc_rtx); | |
1624 | rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); | |
5ec3397e | 1625 | JUMP_LABEL (j) = convert_label; |
8845cb37 AS |
1626 | LABEL_NUSES (convert_label) += 1; |
1627 | } | |
1628 | else | |
1629 | { | |
1630 | /* Just do the subtract/compare. Since this is the last block | |
1631 | the convert code will be generated immediately following. */ | |
1632 | if (TARGET_P9_MISC) | |
1633 | { | |
1634 | rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1, | |
1635 | tmp_reg_src2); | |
1636 | emit_insn (gen_rtx_SET (cond, cmp)); | |
1637 | } | |
1638 | else | |
1639 | if (TARGET_64BIT) | |
1640 | emit_insn (gen_subfdi3_carry (tmp_reg_src2, tmp_reg_src2, | |
1641 | tmp_reg_src1)); | |
1642 | else | |
1643 | emit_insn (gen_subfsi3_carry (tmp_reg_src2, tmp_reg_src2, | |
1644 | tmp_reg_src1)); | |
1645 | } | |
1646 | } | |
1647 | ||
1648 | offset += cmp_bytes; | |
1649 | bytes -= cmp_bytes; | |
1650 | } | |
1651 | ||
1652 | if (generate_6432_conversion) | |
1653 | { | |
1654 | if (convert_label) | |
1655 | emit_label (convert_label); | |
1656 | ||
1657 | /* We need to produce DI result from sub, then convert to target SI | |
ef4adf1f | 1658 | while maintaining <0 / ==0 / >0 properties. This sequence works: |
8845cb37 AS |
1659 | subfc L,A,B |
1660 | subfe H,H,H | |
1661 | popcntd L,L | |
1662 | rldimi L,H,6,0 | |
1663 | ||
1664 | This is an alternate one Segher cooked up if somebody | |
1665 | wants to expand this for something that doesn't have popcntd: | |
1666 | subfc L,a,b | |
1667 | subfe H,x,x | |
1668 | addic t,L,-1 | |
1669 | subfe v,t,L | |
1670 | or z,v,H | |
1671 | ||
1672 | And finally, p9 can just do this: | |
1673 | cmpld A,B | |
1674 | setb r */ | |
1675 | ||
1676 | if (TARGET_P9_MISC) | |
1677 | { | |
1678 | emit_insn (gen_setb_unsigned (target, cond)); | |
1679 | } | |
1680 | else | |
1681 | { | |
1682 | if (TARGET_64BIT) | |
1683 | { | |
1684 | rtx tmp_reg_ca = gen_reg_rtx (DImode); | |
1685 | emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca)); | |
1686 | emit_insn (gen_popcntddi2 (tmp_reg_src2, tmp_reg_src2)); | |
1687 | emit_insn (gen_iordi3 (tmp_reg_src2, tmp_reg_src2, tmp_reg_ca)); | |
1688 | emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2))); | |
1689 | } | |
1690 | else | |
1691 | { | |
1692 | rtx tmp_reg_ca = gen_reg_rtx (SImode); | |
1693 | emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca)); | |
1694 | emit_insn (gen_popcntdsi2 (tmp_reg_src2, tmp_reg_src2)); | |
1695 | emit_insn (gen_iorsi3 (target, tmp_reg_src2, tmp_reg_ca)); | |
1696 | } | |
1697 | } | |
1698 | } | |
1699 | ||
1700 | if (final_label) | |
1701 | emit_label (final_label); | |
1702 | ||
1703 | gcc_assert (bytes == 0); | |
1704 | return true; | |
1705 | } | |
1706 | ||
f7e94dfb | 1707 | /* Generate page crossing check and branch code to set up for |
8845cb37 AS |
1708 | strncmp when we don't have DI alignment. |
1709 | STRNCMP_LABEL is the label to branch if there is a page crossing. | |
f7e94dfb | 1710 | SRC_ADDR is the string address to be examined. |
8845cb37 AS |
1711 | BYTES is the max number of bytes to compare. */ |
1712 | static void | |
f7e94dfb | 1713 | expand_strncmp_align_check (rtx strncmp_label, rtx src_addr, HOST_WIDE_INT bytes) |
8845cb37 AS |
1714 | { |
1715 | rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label); | |
f7e94dfb AS |
1716 | rtx src_pgoff = gen_reg_rtx (GET_MODE (src_addr)); |
1717 | do_and3 (src_pgoff, src_addr, GEN_INT (0xfff)); | |
8845cb37 | 1718 | rtx cond = gen_reg_rtx (CCmode); |
f7e94dfb | 1719 | emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_pgoff, |
8845cb37 AS |
1720 | GEN_INT (4096 - bytes))); |
1721 | ||
0c791c59 | 1722 | rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx); |
8845cb37 AS |
1723 | |
1724 | rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, | |
0c791c59 | 1725 | lab_ref, pc_rtx); |
8845cb37 AS |
1726 | rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); |
1727 | JUMP_LABEL (j) = strncmp_label; | |
1728 | LABEL_NUSES (strncmp_label) += 1; | |
1729 | } | |
1730 | ||
/* Generate the sequence of compares for strcmp/strncmp using gpr instructions.
   BYTES_TO_COMPARE is the number of bytes to be compared.
   BASE_ALIGN is the smaller of the alignment of the two strings.
   ORIG_SRC1 is the unmodified rtx for the first string.
   ORIG_SRC2 is the unmodified rtx for the second string.
   TMP_REG_SRC1 is the register for loading the first string.
   TMP_REG_SRC2 is the register for loading the second string.
   RESULT_REG is the rtx for the result register.
   EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
   to strcmp/strncmp if we have equality at the end of the inline comparison.
   P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
   to clean up and generate the final comparison result.
   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
   set the final result.  */
static void
expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
			     unsigned int base_align,
			     rtx orig_src1, rtx orig_src2,
			     rtx tmp_reg_src1, rtx tmp_reg_src2, rtx result_reg,
			     bool equality_compare_rest, rtx *p_cleanup_label,
			     rtx final_move_label)
{
  unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
  machine_mode load_mode;
  unsigned int load_mode_size;
  unsigned HOST_WIDE_INT cmp_bytes = 0;
  unsigned HOST_WIDE_INT offset = 0;
  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
  gcc_assert (p_cleanup_label != NULL);
  /* The cleanup label is created lazily below; hand it back to the
     caller through P_CLEANUP_LABEL at the end.  */
  rtx cleanup_label = *p_cleanup_label;

  while (bytes_to_compare > 0)
    {
      /* GPR compare sequence:
	 check each 8B with: ld/ld/cmpb/cmpb/orc./bne

	 cleanup code at end:
	 cntlzd		get bit of first zero/diff byte
	 subfic		convert for rldcl use
	 rldcl rldcl	extract diff/zero byte
	 subf		subtract for final result

	 The last compare can branch around the cleanup code if the
	 result is zero because the strings are exactly equal.  */

      unsigned int align = compute_current_alignment (base_align, offset);
      load_mode = select_block_compare_mode (offset, bytes_to_compare, align);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes_to_compare >= load_mode_size)
	cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	{
	  /* Move this load back so it doesn't go past the end.
	     P8/P9 can do this efficiently.  */
	  unsigned int extra_bytes = load_mode_size - bytes_to_compare;
	  cmp_bytes = bytes_to_compare;
	  if (extra_bytes < offset)
	    {
	      offset -= extra_bytes;
	      cmp_bytes = load_mode_size;
	      bytes_to_compare = cmp_bytes;
	    }
	}
      else
	/* P7 and earlier can't do the overlapping load trick fast,
	   so this forces a non-overlapping load and a shift to get
	   rid of the extra bytes.  */
	cmp_bytes = bytes_to_compare;

      /* Keep the current offset in a register for the address arithmetic
	 of both loads.  */
      rtx offset_reg = gen_reg_rtx (Pmode);
      emit_move_insn (offset_reg, GEN_INT (offset));

      rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_reg);
      do_load_for_compare_from_addr (load_mode, tmp_reg_src1, addr1, orig_src1);
      rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_reg);
      do_load_for_compare_from_addr (load_mode, tmp_reg_src2, addr2, orig_src2);

      /* We must always left-align the data we read, and
	 clear any bytes to the right that are beyond the string.
	 Otherwise the cmpb sequence won't produce the correct
	 results.  However if there is only one byte left, we
	 can just subtract to get the final result so the shifts
	 and clears are not needed.  */

      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;

      /* Loading just a single byte is a special case.  If we are
	 loading more than that, we have to check whether we are
	 looking at the entire chunk of data.  If not, rotate left and
	 clear right so that bytes we aren't supposed to look at are
	 zeroed, and the first byte we are supposed to compare is
	 leftmost.  */
      if (load_mode_size != 1)
	{
	  if (load_mode_size < word_mode_size)
	    {
	      /* Rotate left first.  */
	      rtx sh = GEN_INT (BITS_PER_UNIT
				* (word_mode_size - load_mode_size));
	      do_rotl3 (tmp_reg_src1, tmp_reg_src1, sh);
	      do_rotl3 (tmp_reg_src2, tmp_reg_src2, sh);
	    }

	  if (cmp_bytes < word_mode_size)
	    {
	      /* Now clear right.  This plus the rotate can be
		 turned into a rldicr instruction.  */
	      HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
	      rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
	      do_and3 (tmp_reg_src1, tmp_reg_src1, mask);
	      do_and3 (tmp_reg_src2, tmp_reg_src2, mask);
	    }
	}

      /* Cases to handle.  A and B are chunks of the two strings.
	 1: Not end of comparison:
	 A != B: branch to cleanup code to compute result.
	 A == B: check for 0 byte, next block if not found.
	 2: End of the inline comparison:
	 A != B: branch to cleanup code to compute result.
	 A == B: check for 0 byte, call strcmp/strncmp
	 3: compared requested N bytes:
	 A == B: branch to result 0.
	 A != B: cleanup code to compute result.  */

      rtx dst_label;
      if (remain > 0 || equality_compare_rest)
	{
	  /* Branch to cleanup code, otherwise fall through to do
	     more compares.  */
	  if (!cleanup_label)
	    cleanup_label = gen_label_rtx ();
	  dst_label = cleanup_label;
	}
      else
	/* Branch to end and produce result of 0.  */
	dst_label = final_move_label;

      if (load_mode_size == 1)
	{
	  /* Special case for comparing just single byte.  */
	  if (equality_compare_rest)
	    {
	      /* Use subf./bne to branch to final_move_label if the
		 byte differs, otherwise fall through to the strncmp
		 call.  We must also check for a zero byte here as we
		 must not make the library call if this is the end of
		 the string.  */

	      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
	      rtx cond = gen_reg_rtx (CCmode);
	      rtx diff_rtx = gen_rtx_MINUS (word_mode,
					    tmp_reg_src1, tmp_reg_src2);
	      rs6000_emit_dot_insn (result_reg, diff_rtx, 2, cond);
	      rtx cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
						 lab_ref, pc_rtx);
	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      JUMP_LABEL (j) = final_move_label;
	      LABEL_NUSES (final_move_label) += 1;

	      /* Check for zero byte here before fall through to
		 library call.  This catches the case where the
		 strings are equal and end in a zero byte at this
		 position.  */

	      rtx cond0 = gen_reg_rtx (CCmode);
	      emit_move_insn (cond0, gen_rtx_COMPARE (CCmode, tmp_reg_src1,
						      const0_rtx));

	      rtx cmp0eq_rtx = gen_rtx_EQ (VOIDmode, cond0, const0_rtx);

	      rtx ifelse0 = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp0eq_rtx,
						  lab_ref, pc_rtx);
	      rtx j0 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse0));
	      JUMP_LABEL (j0) = final_move_label;
	      LABEL_NUSES (final_move_label) += 1;
	    }
	  else
	    {
	      /* This is the last byte to be compared so we can use
		 subf to compute the final result and branch
		 unconditionally to final_move_label.  */

	      do_sub3 (result_reg, tmp_reg_src1, tmp_reg_src2);

	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
	      JUMP_LABEL (j) = final_move_label;
	      LABEL_NUSES (final_move_label) += 1;
	      emit_barrier ();
	    }
	}
      else
	{
	  /* Multi-byte chunk: cmpb against the other string finds a
	     difference, cmpb against zero finds the terminator; the
	     orc. of the two sets the CC for the branch below.  */
	  rtx cmpb_zero = gen_reg_rtx (word_mode);
	  rtx cmpb_diff = gen_reg_rtx (word_mode);
	  rtx zero_reg = gen_reg_rtx (word_mode);
	  rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
	  rtx cond = gen_reg_rtx (CCmode);

	  emit_move_insn (zero_reg, GEN_INT (0));
	  do_cmpb3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2);
	  do_cmpb3 (cmpb_zero, tmp_reg_src1, zero_reg);
	  rtx not_diff = gen_rtx_NOT (word_mode, cmpb_diff);
	  rtx orc_rtx = gen_rtx_IOR (word_mode, not_diff, cmpb_zero);

	  rs6000_emit_dot_insn (result_reg, orc_rtx, 2, cond);

	  /* At the very end with nothing left to compare, equality
	     means result 0, so branch on EQ; otherwise branch to the
	     cleanup code on NE (difference or zero byte found).  */
	  rtx cmp_rtx;
	  if (remain == 0 && !equality_compare_rest)
	    cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
	  else
	    cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

	  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
					     lab_ref, pc_rtx);
	  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	  JUMP_LABEL (j) = dst_label;
	  LABEL_NUSES (dst_label) += 1;
	}

      offset += cmp_bytes;
      bytes_to_compare -= cmp_bytes;
    }

  *p_cleanup_label = cleanup_label;
  return;
}
1962 | ||
ef4adf1f | 1963 | /* Generate the sequence of compares for strcmp/strncmp using vec/vsx |
9d36bd3b AS |
1964 | instructions. |
1965 | ||
1966 | BYTES_TO_COMPARE is the number of bytes to be compared. | |
1967 | ORIG_SRC1 is the unmodified rtx for the first string. | |
1968 | ORIG_SRC2 is the unmodified rtx for the second string. | |
1969 | S1ADDR is the register to use for the base address of the first string. | |
1970 | S2ADDR is the register to use for the base address of the second string. | |
1971 | OFF_REG is the register to use for the string offset for loads. | |
1972 | S1DATA is the register for loading the first string. | |
1973 | S2DATA is the register for loading the second string. | |
1974 | VEC_RESULT is the rtx for the vector result indicating the byte difference. | |
1975 | EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call | |
1976 | to strcmp/strncmp if we have equality at the end of the inline comparison. | |
1977 | P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code to clean up | |
1978 | and generate the final comparison result. | |
ef4adf1f | 1979 | FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just |
9d36bd3b AS |
1980 | set the final result. */ |
1981 | static void | |
1982 | expand_strncmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare, | |
1983 | rtx orig_src1, rtx orig_src2, | |
1984 | rtx s1addr, rtx s2addr, rtx off_reg, | |
1985 | rtx s1data, rtx s2data, | |
1986 | rtx vec_result, bool equality_compare_rest, | |
1987 | rtx *p_cleanup_label, rtx final_move_label) | |
1988 | { | |
1989 | machine_mode load_mode; | |
1990 | unsigned int load_mode_size; | |
1991 | unsigned HOST_WIDE_INT cmp_bytes = 0; | |
1992 | unsigned HOST_WIDE_INT offset = 0; | |
1993 | ||
1994 | gcc_assert (p_cleanup_label != NULL); | |
1995 | rtx cleanup_label = *p_cleanup_label; | |
1996 | ||
1997 | emit_move_insn (s1addr, force_reg (Pmode, XEXP (orig_src1, 0))); | |
1998 | emit_move_insn (s2addr, force_reg (Pmode, XEXP (orig_src2, 0))); | |
1999 | ||
2000 | unsigned int i; | |
2001 | rtx zr[16]; | |
2002 | for (i = 0; i < 16; i++) | |
2003 | zr[i] = GEN_INT (0); | |
2004 | rtvec zv = gen_rtvec_v (16, zr); | |
2005 | rtx zero_reg = gen_reg_rtx (V16QImode); | |
2006 | rs6000_expand_vector_init (zero_reg, gen_rtx_PARALLEL (V16QImode, zv)); | |
2007 | ||
2008 | while (bytes_to_compare > 0) | |
2009 | { | |
2010 | /* VEC/VSX compare sequence for P8: | |
2011 | check each 16B with: | |
2012 | lxvd2x 32,28,8 | |
2013 | lxvd2x 33,29,8 | |
2014 | vcmpequb 2,0,1 # compare strings | |
2015 | vcmpequb 4,0,3 # compare w/ 0 | |
2016 | xxlorc 37,36,34 # first FF byte is either mismatch or end of string | |
2017 | vcmpequb. 7,5,3 # reg 7 contains 0 | |
2018 | bnl 6,.Lmismatch | |
2019 | ||
2020 | For the P8 LE case, we use lxvd2x and compare full 16 bytes | |
2021 | but then use use vgbbd and a shift to get two bytes with the | |
2022 | information we need in the correct order. | |
2023 | ||
2024 | VEC/VSX compare sequence if TARGET_P9_VECTOR: | |
2025 | lxvb16x/lxvb16x # load 16B of each string | |
2026 | vcmpnezb. # produces difference location or zero byte location | |
2027 | bne 6,.Lmismatch | |
2028 | ||
2029 | Use the overlapping compare trick for the last block if it is | |
ef4adf1f | 2030 | less than 16 bytes. |
9d36bd3b AS |
2031 | */ |
2032 | ||
2033 | load_mode = V16QImode; | |
2034 | load_mode_size = GET_MODE_SIZE (load_mode); | |
ef4adf1f | 2035 | |
9d36bd3b AS |
2036 | if (bytes_to_compare >= load_mode_size) |
2037 | cmp_bytes = load_mode_size; | |
2038 | else | |
2039 | { | |
2040 | /* Move this load back so it doesn't go past the end. P8/P9 | |
2041 | can do this efficiently. This is never called with less | |
2042 | than 16 bytes so we should always be able to do this. */ | |
2043 | unsigned int extra_bytes = load_mode_size - bytes_to_compare; | |
2044 | cmp_bytes = bytes_to_compare; | |
2045 | gcc_assert (offset > extra_bytes); | |
2046 | offset -= extra_bytes; | |
2047 | cmp_bytes = load_mode_size; | |
2048 | bytes_to_compare = cmp_bytes; | |
2049 | } | |
2050 | ||
2051 | /* The offset currently used is always kept in off_reg so that the | |
2052 | cleanup code on P8 can use it to extract the differing byte. */ | |
2053 | emit_move_insn (off_reg, GEN_INT (offset)); | |
2054 | ||
2055 | rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg); | |
2056 | do_load_for_compare_from_addr (load_mode, s1data, addr1, orig_src1); | |
2057 | rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg); | |
2058 | do_load_for_compare_from_addr (load_mode, s2data, addr2, orig_src2); | |
2059 | ||
2060 | /* Cases to handle. A and B are chunks of the two strings. | |
2061 | 1: Not end of comparison: | |
2062 | A != B: branch to cleanup code to compute result. | |
2063 | A == B: next block | |
2064 | 2: End of the inline comparison: | |
2065 | A != B: branch to cleanup code to compute result. | |
2066 | A == B: call strcmp/strncmp | |
2067 | 3: compared requested N bytes: | |
2068 | A == B: branch to result 0. | |
2069 | A != B: cleanup code to compute result. */ | |
2070 | ||
2071 | unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes; | |
2072 | ||
2073 | if (TARGET_P9_VECTOR) | |
2074 | emit_insn (gen_vcmpnezb_p (vec_result, s1data, s2data)); | |
2075 | else | |
2076 | { | |
2077 | /* Emit instructions to do comparison and zero check. */ | |
2078 | rtx cmp_res = gen_reg_rtx (load_mode); | |
2079 | rtx cmp_zero = gen_reg_rtx (load_mode); | |
2080 | rtx cmp_combined = gen_reg_rtx (load_mode); | |
2081 | emit_insn (gen_altivec_eqv16qi (cmp_res, s1data, s2data)); | |
2082 | emit_insn (gen_altivec_eqv16qi (cmp_zero, s1data, zero_reg)); | |
2083 | emit_insn (gen_orcv16qi3 (vec_result, cmp_zero, cmp_res)); | |
2084 | emit_insn (gen_altivec_vcmpequb_p (cmp_combined, vec_result, zero_reg)); | |
2085 | } | |
2086 | ||
2087 | bool branch_to_cleanup = (remain > 0 || equality_compare_rest); | |
2088 | rtx cr6 = gen_rtx_REG (CCmode, CR6_REGNO); | |
2089 | rtx dst_label; | |
2090 | rtx cmp_rtx; | |
2091 | if (branch_to_cleanup) | |
2092 | { | |
2093 | /* Branch to cleanup code, otherwise fall through to do more | |
ef4adf1f | 2094 | compares. P8 and P9 use different CR bits because on P8 |
9d36bd3b AS |
2095 | we are looking at the result of a comparsion vs a |
2096 | register of zeroes so the all-true condition means no | |
ef4adf1f | 2097 | difference or zero was found. On P9, vcmpnezb sets a byte |
9d36bd3b AS |
2098 | to 0xff if there is a mismatch or zero, so the all-false |
2099 | condition indicates we found no difference or zero. */ | |
2100 | if (!cleanup_label) | |
2101 | cleanup_label = gen_label_rtx (); | |
2102 | dst_label = cleanup_label; | |
2103 | if (TARGET_P9_VECTOR) | |
2104 | cmp_rtx = gen_rtx_NE (VOIDmode, cr6, const0_rtx); | |
2105 | else | |
2106 | cmp_rtx = gen_rtx_GE (VOIDmode, cr6, const0_rtx); | |
2107 | } | |
2108 | else | |
2109 | { | |
ef4adf1f | 2110 | /* Branch to final return or fall through to cleanup, |
9d36bd3b AS |
2111 | result is already set to 0. */ |
2112 | dst_label = final_move_label; | |
2113 | if (TARGET_P9_VECTOR) | |
2114 | cmp_rtx = gen_rtx_EQ (VOIDmode, cr6, const0_rtx); | |
2115 | else | |
2116 | cmp_rtx = gen_rtx_LT (VOIDmode, cr6, const0_rtx); | |
2117 | } | |
2118 | ||
2119 | rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label); | |
2120 | rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, | |
2121 | lab_ref, pc_rtx); | |
2122 | rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); | |
2123 | JUMP_LABEL (j2) = dst_label; | |
2124 | LABEL_NUSES (dst_label) += 1; | |
2125 | ||
2126 | offset += cmp_bytes; | |
2127 | bytes_to_compare -= cmp_bytes; | |
2128 | } | |
2129 | *p_cleanup_label = cleanup_label; | |
2130 | return; | |
74f9986e AS |
2131 | } |
2132 | ||
f7e94dfb AS |
2133 | /* Generate the final sequence that identifies the differing |
2134 | byte and generates the final result, taking into account | |
2135 | zero bytes: | |
ef4adf1f | 2136 | |
f7e94dfb AS |
2137 | cntlzd get bit of first zero/diff byte |
2138 | addi convert for rldcl use | |
2139 | rldcl rldcl extract diff/zero byte | |
2140 | subf subtract for final result | |
2141 | ||
2142 | STR1 is the reg rtx for data from string 1. | |
2143 | STR2 is the reg rtx for data from string 2. | |
2144 | RESULT is the reg rtx for the comparison result. */ | |
2145 | ||
2146 | static void | |
2147 | emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result) | |
2148 | { | |
2149 | machine_mode m = GET_MODE (str1); | |
f7e94dfb | 2150 | rtx rot_amt = gen_reg_rtx (m); |
f7e94dfb AS |
2151 | |
2152 | rtx rot1_1 = gen_reg_rtx (m); | |
2153 | rtx rot1_2 = gen_reg_rtx (m); | |
2154 | rtx rot2_1 = gen_reg_rtx (m); | |
2155 | rtx rot2_2 = gen_reg_rtx (m); | |
2156 | ||
2157 | if (m == SImode) | |
2158 | { | |
ef4adf1f | 2159 | emit_insn (gen_clzsi2 (rot_amt, result)); |
f7e94dfb AS |
2160 | emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8))); |
2161 | emit_insn (gen_rotlsi3 (rot1_1, str1, | |
2162 | gen_lowpart (SImode, rot_amt))); | |
2163 | emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff))); | |
2164 | emit_insn (gen_rotlsi3 (rot2_1, str2, | |
2165 | gen_lowpart (SImode, rot_amt))); | |
2166 | emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff))); | |
2167 | emit_insn (gen_subsi3 (result, rot1_2, rot2_2)); | |
2168 | } | |
2169 | else if (m == DImode) | |
2170 | { | |
ef4adf1f | 2171 | emit_insn (gen_clzdi2 (rot_amt, result)); |
f7e94dfb AS |
2172 | emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8))); |
2173 | emit_insn (gen_rotldi3 (rot1_1, str1, | |
2174 | gen_lowpart (SImode, rot_amt))); | |
2175 | emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff))); | |
2176 | emit_insn (gen_rotldi3 (rot2_1, str2, | |
2177 | gen_lowpart (SImode, rot_amt))); | |
2178 | emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff))); | |
2179 | emit_insn (gen_subdi3 (result, rot1_2, rot2_2)); | |
2180 | } | |
2181 | else | |
2182 | gcc_unreachable (); | |
ef4adf1f | 2183 | |
f7e94dfb AS |
2184 | return; |
2185 | } | |
2186 | ||
9d36bd3b AS |
2187 | /* Generate the final sequence that identifies the differing |
2188 | byte and generates the final result, taking into account | |
2189 | zero bytes: | |
2190 | ||
2191 | P8: | |
2192 | vgbbd 0,0 | |
2193 | vsldoi 0,0,0,9 | |
2194 | mfvsrd 9,32 | |
2195 | addi 10,9,-1 # count trailing zero bits | |
2196 | andc 9,10,9 | |
2197 | popcntd 9,9 | |
2198 | lbzx 10,28,9 # use that offset to load differing byte | |
2199 | lbzx 3,29,9 | |
2200 | subf 3,3,10 # subtract for final result | |
ef4adf1f | 2201 | |
9d36bd3b AS |
2202 | P9: |
2203 | vclzlsbb # counts trailing bytes with lsb=0 | |
ef4adf1f | 2204 | vextublx # extract differing byte |
9d36bd3b AS |
2205 | |
2206 | STR1 is the reg rtx for data from string 1. | |
2207 | STR2 is the reg rtx for data from string 2. | |
2208 | RESULT is the reg rtx for the comparison result. | |
2209 | S1ADDR is the register to use for the base address of the first string. | |
2210 | S2ADDR is the register to use for the base address of the second string. | |
2211 | ORIG_SRC1 is the unmodified rtx for the first string. | |
2212 | ORIG_SRC2 is the unmodified rtx for the second string. | |
2213 | OFF_REG is the register to use for the string offset for loads. | |
2214 | VEC_RESULT is the rtx for the vector result indicating the byte difference. | |
2215 | */ | |
2216 | ||
2217 | static void | |
2218 | emit_final_str_compare_vec (rtx str1, rtx str2, rtx result, | |
2219 | rtx s1addr, rtx s2addr, | |
2220 | rtx orig_src1, rtx orig_src2, | |
2221 | rtx off_reg, rtx vec_result) | |
2222 | { | |
2223 | if (TARGET_P9_VECTOR) | |
2224 | { | |
2225 | rtx diffix = gen_reg_rtx (SImode); | |
2226 | rtx chr1 = gen_reg_rtx (SImode); | |
2227 | rtx chr2 = gen_reg_rtx (SImode); | |
2228 | rtx chr1_di = simplify_gen_subreg (DImode, chr1, SImode, 0); | |
2229 | rtx chr2_di = simplify_gen_subreg (DImode, chr2, SImode, 0); | |
2230 | emit_insn (gen_vclzlsbb_v16qi (diffix, vec_result)); | |
2231 | emit_insn (gen_vextublx (chr1, diffix, str1)); | |
2232 | emit_insn (gen_vextublx (chr2, diffix, str2)); | |
2233 | do_sub3 (result, chr1_di, chr2_di); | |
2234 | } | |
2235 | else | |
2236 | { | |
6bd2b8ec | 2237 | gcc_assert (TARGET_P8_VECTOR); |
9d36bd3b AS |
2238 | rtx diffix = gen_reg_rtx (DImode); |
2239 | rtx result_gbbd = gen_reg_rtx (V16QImode); | |
ef4adf1f | 2240 | /* Since each byte of the input is either 00 or FF, the bytes in |
9d36bd3b AS |
2241 | dw0 and dw1 after vgbbd are all identical to each other. */ |
2242 | emit_insn (gen_p8v_vgbbd (result_gbbd, vec_result)); | |
2243 | /* For LE, we shift by 9 and get BA in the low two bytes then CTZ. | |
2244 | For BE, we shift by 7 and get AB in the high two bytes then CLZ. */ | |
2245 | rtx result_shifted = gen_reg_rtx (V16QImode); | |
2246 | int shift_amt = (BYTES_BIG_ENDIAN) ? 7 : 9; | |
2247 | emit_insn (gen_altivec_vsldoi_v16qi (result_shifted,result_gbbd,result_gbbd, GEN_INT (shift_amt))); | |
2248 | ||
2249 | rtx diffix_df = simplify_gen_subreg (DFmode, diffix, DImode, 0); | |
2250 | emit_insn (gen_p8_mfvsrd_3_v16qi (diffix_df, result_shifted)); | |
2251 | rtx count = gen_reg_rtx (DImode); | |
2252 | ||
2253 | if (BYTES_BIG_ENDIAN) | |
2254 | emit_insn (gen_clzdi2 (count, diffix)); | |
2255 | else | |
2256 | emit_insn (gen_ctzdi2 (count, diffix)); | |
2257 | ||
ef4adf1f | 2258 | /* P8 doesn't have a good solution for extracting one byte from |
9d36bd3b AS |
2259 | a vsx reg like vextublx on P9 so we just compute the offset |
2260 | of the differing byte and load it from each string. */ | |
2261 | do_add3 (off_reg, off_reg, count); | |
2262 | ||
2263 | rtx chr1 = gen_reg_rtx (QImode); | |
2264 | rtx chr2 = gen_reg_rtx (QImode); | |
2265 | rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg); | |
2266 | do_load_for_compare_from_addr (QImode, chr1, addr1, orig_src1); | |
2267 | rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg); | |
2268 | do_load_for_compare_from_addr (QImode, chr2, addr2, orig_src2); | |
2269 | machine_mode rmode = GET_MODE (result); | |
2270 | rtx chr1_rm = simplify_gen_subreg (rmode, chr1, QImode, 0); | |
2271 | rtx chr2_rm = simplify_gen_subreg (rmode, chr2, QImode, 0); | |
2272 | do_sub3 (result, chr1_rm, chr2_rm); | |
2273 | } | |
2274 | ||
2275 | return; | |
2276 | } | |
2277 | ||
8845cb37 | 2278 | /* Expand a string compare operation with length, and return |
ef4adf1f | 2279 | true if successful. Return false if we should let the |
8845cb37 AS |
2280 | compiler generate normal code, probably a strncmp call. |
2281 | ||
2282 | OPERANDS[0] is the target (result). | |
2283 | OPERANDS[1] is the first source. | |
2284 | OPERANDS[2] is the second source. | |
2285 | If NO_LENGTH is zero, then: | |
2286 | OPERANDS[3] is the length. | |
2287 | OPERANDS[4] is the alignment in bytes. | |
2288 | If NO_LENGTH is nonzero, then: | |
2289 | OPERANDS[3] is the alignment in bytes. */ | |
2290 | bool | |
2291 | expand_strn_compare (rtx operands[], int no_length) | |
2292 | { | |
2293 | rtx target = operands[0]; | |
2294 | rtx orig_src1 = operands[1]; | |
2295 | rtx orig_src2 = operands[2]; | |
2296 | rtx bytes_rtx, align_rtx; | |
2297 | if (no_length) | |
2298 | { | |
2299 | bytes_rtx = NULL; | |
2300 | align_rtx = operands[3]; | |
2301 | } | |
2302 | else | |
2303 | { | |
2304 | bytes_rtx = operands[3]; | |
2305 | align_rtx = operands[4]; | |
2306 | } | |
74f9986e | 2307 | |
f7e94dfb AS |
2308 | rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0)); |
2309 | rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0)); | |
8845cb37 | 2310 | |
ef4adf1f | 2311 | /* If we have a length, it must be constant. This simplifies things |
8845cb37 | 2312 | a bit as we don't have to generate code to check if we've exceeded |
ef4adf1f | 2313 | the length. Later this could be expanded to handle this case. */ |
8845cb37 AS |
2314 | if (!no_length && !CONST_INT_P (bytes_rtx)) |
2315 | return false; | |
2316 | ||
2317 | /* This must be a fixed size alignment. */ | |
2318 | if (!CONST_INT_P (align_rtx)) | |
2319 | return false; | |
2320 | ||
2321 | unsigned int base_align = UINTVAL (align_rtx); | |
f7e94dfb AS |
2322 | unsigned int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT; |
2323 | unsigned int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT; | |
8845cb37 | 2324 | |
e0bd6c9f RS |
2325 | /* targetm.slow_unaligned_access -- don't do unaligned stuff. */ |
2326 | if (targetm.slow_unaligned_access (word_mode, align1) | |
2327 | || targetm.slow_unaligned_access (word_mode, align2)) | |
8845cb37 AS |
2328 | return false; |
2329 | ||
2330 | gcc_assert (GET_MODE (target) == SImode); | |
2331 | ||
9d36bd3b | 2332 | unsigned int required_align = 8; |
8845cb37 AS |
2333 | |
2334 | unsigned HOST_WIDE_INT offset = 0; | |
2335 | unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available. */ | |
2336 | unsigned HOST_WIDE_INT compare_length; /* How much to compare inline. */ | |
9d36bd3b | 2337 | |
8845cb37 | 2338 | if (no_length) |
9d36bd3b | 2339 | bytes = rs6000_string_compare_inline_limit; |
8845cb37 AS |
2340 | else |
2341 | bytes = UINTVAL (bytes_rtx); | |
2342 | ||
ef4adf1f | 2343 | /* Is it OK to use vec/vsx for this. TARGET_VSX means we have at |
9d36bd3b AS |
2344 | least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is |
2345 | at least POWER8. That way we can rely on overlapping compares to | |
6bd2b8ec AS |
2346 | do the final comparison of less than 16 bytes. Also I do not |
2347 | want to deal with making this work for 32 bits. In addition, we | |
2348 | have to make sure that we have at least P8_VECTOR (we don't allow | |
2349 | P9_VECTOR without P8_VECTOR). */ | |
2350 | int use_vec = (bytes >= 16 && !TARGET_32BIT | |
2351 | && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR); | |
9d36bd3b AS |
2352 | |
2353 | if (use_vec) | |
2354 | required_align = 16; | |
2355 | ||
2356 | machine_mode load_mode; | |
2357 | rtx tmp_reg_src1, tmp_reg_src2; | |
2358 | if (use_vec) | |
2359 | { | |
2360 | load_mode = V16QImode; | |
2361 | tmp_reg_src1 = gen_reg_rtx (V16QImode); | |
2362 | tmp_reg_src2 = gen_reg_rtx (V16QImode); | |
2363 | } | |
2364 | else | |
2365 | { | |
2366 | load_mode = select_block_compare_mode (0, bytes, base_align); | |
2367 | tmp_reg_src1 = gen_reg_rtx (word_mode); | |
2368 | tmp_reg_src2 = gen_reg_rtx (word_mode); | |
2369 | } | |
2370 | ||
2371 | compare_length = rs6000_string_compare_inline_limit; | |
8845cb37 AS |
2372 | |
2373 | /* If we have equality at the end of the last compare and we have not | |
2374 | found the end of the string, we need to call strcmp/strncmp to | |
2375 | compare the remainder. */ | |
2376 | bool equality_compare_rest = false; | |
2377 | ||
2378 | if (no_length) | |
2379 | { | |
2380 | bytes = compare_length; | |
2381 | equality_compare_rest = true; | |
2382 | } | |
2383 | else | |
2384 | { | |
2385 | if (bytes <= compare_length) | |
2386 | compare_length = bytes; | |
2387 | else | |
2388 | equality_compare_rest = true; | |
2389 | } | |
2390 | ||
2391 | rtx result_reg = gen_reg_rtx (word_mode); | |
2392 | rtx final_move_label = gen_label_rtx (); | |
2393 | rtx final_label = gen_label_rtx (); | |
2394 | rtx begin_compare_label = NULL; | |
ef4adf1f | 2395 | |
f7e94dfb | 2396 | if (base_align < required_align) |
8845cb37 AS |
2397 | { |
2398 | /* Generate code that checks distance to 4k boundary for this case. */ | |
2399 | begin_compare_label = gen_label_rtx (); | |
2400 | rtx strncmp_label = gen_label_rtx (); | |
2401 | rtx jmp; | |
2402 | ||
2403 | /* Strncmp for power8 in glibc does this: | |
5ec3397e AS |
2404 | rldicl r8,r3,0,52 |
2405 | cmpldi cr7,r8,4096-16 | |
2406 | bgt cr7,L(pagecross) */ | |
8845cb37 AS |
2407 | |
2408 | /* Make sure that the length we use for the alignment test and | |
2409 | the subsequent code generation are in agreement so we do not | |
2410 | go past the length we tested for a 4k boundary crossing. */ | |
2411 | unsigned HOST_WIDE_INT align_test = compare_length; | |
9d36bd3b | 2412 | if (align_test < required_align) |
8845cb37 AS |
2413 | { |
2414 | align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test); | |
2415 | base_align = align_test; | |
2416 | } | |
2417 | else | |
2418 | { | |
f7e94dfb AS |
2419 | align_test = ROUND_UP (align_test, required_align); |
2420 | base_align = required_align; | |
8845cb37 AS |
2421 | } |
2422 | ||
f7e94dfb AS |
2423 | if (align1 < required_align) |
2424 | expand_strncmp_align_check (strncmp_label, src1_addr, align_test); | |
2425 | if (align2 < required_align) | |
2426 | expand_strncmp_align_check (strncmp_label, src2_addr, align_test); | |
8845cb37 AS |
2427 | |
2428 | /* Now generate the following sequence: | |
2429 | - branch to begin_compare | |
2430 | - strncmp_label | |
2431 | - call to strncmp | |
2432 | - branch to final_label | |
2433 | - begin_compare_label */ | |
2434 | ||
2435 | rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label); | |
2436 | jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref)); | |
2437 | JUMP_LABEL (jmp) = begin_compare_label; | |
2438 | LABEL_NUSES (begin_compare_label) += 1; | |
2439 | emit_barrier (); | |
2440 | ||
2441 | emit_label (strncmp_label); | |
2442 | ||
8845cb37 AS |
2443 | if (no_length) |
2444 | { | |
2445 | tree fun = builtin_decl_explicit (BUILT_IN_STRCMP); | |
2446 | emit_library_call_value (XEXP (DECL_RTL (fun), 0), | |
db69559b | 2447 | target, LCT_NORMAL, GET_MODE (target), |
f7e94dfb AS |
2448 | force_reg (Pmode, src1_addr), Pmode, |
2449 | force_reg (Pmode, src2_addr), Pmode); | |
8845cb37 AS |
2450 | } |
2451 | else | |
2452 | { | |
2453 | /* -m32 -mpowerpc64 results in word_mode being DImode even | |
9d36bd3b | 2454 | though otherwise it is 32-bit. The length arg to strncmp |
8845cb37 | 2455 | is a size_t which will be the same size as pointers. */ |
e9727bda AS |
2456 | rtx len_rtx = gen_reg_rtx (Pmode); |
2457 | emit_move_insn (len_rtx, gen_int_mode (bytes, Pmode)); | |
8845cb37 AS |
2458 | |
2459 | tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP); | |
2460 | emit_library_call_value (XEXP (DECL_RTL (fun), 0), | |
db69559b | 2461 | target, LCT_NORMAL, GET_MODE (target), |
f7e94dfb AS |
2462 | force_reg (Pmode, src1_addr), Pmode, |
2463 | force_reg (Pmode, src2_addr), Pmode, | |
e9727bda | 2464 | len_rtx, Pmode); |
8845cb37 AS |
2465 | } |
2466 | ||
2467 | rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); | |
2468 | jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref)); | |
2469 | JUMP_LABEL (jmp) = final_label; | |
2470 | LABEL_NUSES (final_label) += 1; | |
2471 | emit_barrier (); | |
2472 | emit_label (begin_compare_label); | |
2473 | } | |
2474 | ||
2475 | rtx cleanup_label = NULL; | |
9d36bd3b | 2476 | rtx s1addr = NULL, s2addr = NULL, off_reg = NULL, vec_result = NULL; |
8845cb37 | 2477 | |
f7e94dfb | 2478 | /* Generate a sequence of GPR or VEC/VSX instructions to compare out |
8845cb37 | 2479 | to the length specified. */ |
9d36bd3b AS |
2480 | if (use_vec) |
2481 | { | |
2482 | s1addr = gen_reg_rtx (Pmode); | |
2483 | s2addr = gen_reg_rtx (Pmode); | |
2484 | off_reg = gen_reg_rtx (Pmode); | |
2485 | vec_result = gen_reg_rtx (load_mode); | |
2486 | emit_move_insn (result_reg, GEN_INT (0)); | |
2487 | expand_strncmp_vec_sequence (compare_length, | |
2488 | orig_src1, orig_src2, | |
2489 | s1addr, s2addr, off_reg, | |
2490 | tmp_reg_src1, tmp_reg_src2, | |
2491 | vec_result, | |
2492 | equality_compare_rest, | |
2493 | &cleanup_label, final_move_label); | |
2494 | } | |
2495 | else | |
2496 | expand_strncmp_gpr_sequence (compare_length, base_align, | |
2497 | orig_src1, orig_src2, | |
2498 | tmp_reg_src1, tmp_reg_src2, | |
2499 | result_reg, | |
2500 | equality_compare_rest, | |
2501 | &cleanup_label, final_move_label); | |
74f9986e AS |
2502 | |
2503 | offset = compare_length; | |
ef4adf1f | 2504 | |
8845cb37 AS |
2505 | if (equality_compare_rest) |
2506 | { | |
2507 | /* Update pointers past what has been compared already. */ | |
f7e94dfb AS |
2508 | rtx src1 = force_reg (Pmode, |
2509 | gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset))); | |
2510 | rtx src2 = force_reg (Pmode, | |
2511 | gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset))); | |
8845cb37 AS |
2512 | |
2513 | /* Construct call to strcmp/strncmp to compare the rest of the string. */ | |
2514 | if (no_length) | |
2515 | { | |
2516 | tree fun = builtin_decl_explicit (BUILT_IN_STRCMP); | |
2517 | emit_library_call_value (XEXP (DECL_RTL (fun), 0), | |
db69559b | 2518 | target, LCT_NORMAL, GET_MODE (target), |
f7e94dfb | 2519 | src1, Pmode, src2, Pmode); |
8845cb37 AS |
2520 | } |
2521 | else | |
2522 | { | |
e9727bda AS |
2523 | rtx len_rtx = gen_reg_rtx (Pmode); |
2524 | emit_move_insn (len_rtx, gen_int_mode (bytes - compare_length, Pmode)); | |
8845cb37 AS |
2525 | tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP); |
2526 | emit_library_call_value (XEXP (DECL_RTL (fun), 0), | |
db69559b | 2527 | target, LCT_NORMAL, GET_MODE (target), |
e9727bda | 2528 | src1, Pmode, src2, Pmode, len_rtx, Pmode); |
8845cb37 AS |
2529 | } |
2530 | ||
2531 | rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); | |
2532 | rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref)); | |
2533 | JUMP_LABEL (jmp) = final_label; | |
2534 | LABEL_NUSES (final_label) += 1; | |
2535 | emit_barrier (); | |
2536 | } | |
2537 | ||
2538 | if (cleanup_label) | |
2539 | emit_label (cleanup_label); | |
2540 | ||
9d36bd3b AS |
2541 | if (use_vec) |
2542 | emit_final_str_compare_vec (tmp_reg_src1, tmp_reg_src2, result_reg, | |
2543 | s1addr, s2addr, orig_src1, orig_src2, | |
2544 | off_reg, vec_result); | |
2545 | else | |
2546 | emit_final_str_compare_gpr (tmp_reg_src1, tmp_reg_src2, result_reg); | |
8845cb37 AS |
2547 | |
2548 | emit_label (final_move_label); | |
2549 | emit_insn (gen_movsi (target, | |
2550 | gen_lowpart (SImode, result_reg))); | |
2551 | emit_label (final_label); | |
2552 | return true; | |
2553 | } | |
2554 | ||
/* Expand a block move operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the source
   operands[2] is the length
   operands[3] is the alignment */

/* Maximum number of loads we batch before emitting the paired stores;
   loading several registers first lets the scheduler overlap the loads
   with the subsequent stores.  */
#define MAX_MOVE_REG 4

int
expand_block_move (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx orig_src	= operands[1];
  rtx bytes_rtx	= operands[2];
  rtx align_rtx = operands[3];
  int constp	= (GET_CODE (bytes_rtx) == CONST_INT);
  int align;			/* Known alignment, in bits.  */
  int bytes;			/* Total byte count to move.  */
  int offset;			/* Current offset into src/dest.  */
  int move_bytes;		/* Bytes moved by the current chunk.  */
  rtx stores[MAX_MOVE_REG];	/* Pending store insns, emitted in batches.  */
  int num_reg = 0;		/* Number of pending stores in STORES.  */

  /* If this is not a fixed size move, just call memcpy */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment */
  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to move? */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  /* Past this size, an out-of-line memcpy call is expected to win.  */
  if (bytes > rs6000_block_move_inline_limit)
    return 0;

  /* Greedily pick the widest move the remaining size and alignment
     allow: 16-byte vector, 8, 4, 2, then single bytes.  */
  for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
    {
      union {
	rtx (*movmemsi) (rtx, rtx, rtx, rtx);
	rtx (*mov) (rtx, rtx);
      } gen_func;
      machine_mode mode = BLKmode;
      rtx src, dest;

      /* Altivec first, since it will be faster than a string move
	 when it applies, and usually not significantly larger.  */
      if (TARGET_ALTIVEC && bytes >= 16 && (TARGET_EFFICIENT_UNALIGNED_VSX || align >= 128))
	{
	  move_bytes = 16;
	  mode = V4SImode;
	  gen_func.mov = gen_movv4si;
	}
      else if (bytes >= 8 && TARGET_POWERPC64
	       && (align >= 64 || !STRICT_ALIGNMENT))
	{
	  move_bytes = 8;
	  mode = DImode;
	  gen_func.mov = gen_movdi;
	  if (offset == 0 && align < 64)
	    {
	      rtx addr;

	      /* If the address form is reg+offset with offset not a
		 multiple of four, reload into reg indirect form here
		 rather than waiting for reload.  This way we get one
		 reload, not one per load and/or store.  */
	      addr = XEXP (orig_dest, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_dest = replace_equiv_address (orig_dest, addr);
		}
	      addr = XEXP (orig_src, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_src = replace_equiv_address (orig_src, addr);
		}
	    }
	}
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
	{			/* move 4 bytes */
	  move_bytes = 4;
	  mode = SImode;
	  gen_func.mov = gen_movsi;
	}
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
	{			/* move 2 bytes */
	  move_bytes = 2;
	  mode = HImode;
	  gen_func.mov = gen_movhi;
	}
      else /* move 1 byte at a time */
	{
	  move_bytes = 1;
	  mode = QImode;
	  gen_func.mov = gen_movqi;
	}

      src = adjust_address (orig_src, mode, offset);
      dest = adjust_address (orig_dest, mode, offset);

      if (mode != BLKmode)
	{
	  rtx tmp_reg = gen_reg_rtx (mode);

	  /* Emit the load now, but queue the store so that up to
	     MAX_MOVE_REG loads can issue before their stores.  */
	  emit_insn ((*gen_func.mov) (tmp_reg, src));
	  stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
	}

      /* Flush queued stores when the batch is full or we are done.  */
      if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes)
	{
	  int i;
	  for (i = 0; i < num_reg; i++)
	    emit_insn (stores[i]);
	  num_reg = 0;
	}

      /* NOTE(review): every arm of the chain above assigns a non-BLK
	 mode, so this path looks unreachable in this revision --
	 presumably a remnant of the removed string-instruction
	 expansions; confirm before relying on it.  */
      if (mode == BLKmode)
	{
	  /* Move the address into scratch registers.  The movmemsi
	     patterns require zero offset.  */
	  if (!REG_P (XEXP (src, 0)))
	    {
	      rtx src_reg = copy_addr_to_reg (XEXP (src, 0));
	      src = replace_equiv_address (src, src_reg);
	    }
	  set_mem_size (src, move_bytes);

	  if (!REG_P (XEXP (dest, 0)))
	    {
	      rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0));
	      dest = replace_equiv_address (dest, dest_reg);
	    }
	  set_mem_size (dest, move_bytes);

	  emit_insn ((*gen_func.movmemsi) (dest, src,
					   GEN_INT (move_bytes & 31),
					   align_rtx));
	}
    }

  return 1;
}