]>
Commit | Line | Data |
---|---|---|
8845cb37 AS |
1 | /* Subroutines used to expand string and block move, clear, |
2 | compare and other operations for PowerPC. | |
85ec4feb | 3 | Copyright (C) 1991-2018 Free Software Foundation, Inc. |
8845cb37 AS |
4 | |
5 | This file is part of GCC. | |
6 | ||
7 | GCC is free software; you can redistribute it and/or modify it | |
8 | under the terms of the GNU General Public License as published | |
9 | by the Free Software Foundation; either version 3, or (at your | |
10 | option) any later version. | |
11 | ||
12 | GCC is distributed in the hope that it will be useful, but WITHOUT | |
13 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
14 | or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public | |
15 | License for more details. | |
16 | ||
17 | You should have received a copy of the GNU General Public License | |
18 | along with GCC; see the file COPYING3. If not see | |
19 | <http://www.gnu.org/licenses/>. */ | |
20 | ||
8fcc61f8 RS |
21 | #define IN_TARGET_CODE 1 |
22 | ||
8845cb37 AS |
23 | #include "config.h" |
24 | #include "system.h" | |
25 | #include "coretypes.h" | |
26 | #include "backend.h" | |
27 | #include "rtl.h" | |
28 | #include "tree.h" | |
29 | #include "memmodel.h" | |
30 | #include "tm_p.h" | |
31 | #include "ira.h" | |
32 | #include "print-tree.h" | |
33 | #include "varasm.h" | |
34 | #include "explow.h" | |
35 | #include "expr.h" | |
36 | #include "output.h" | |
e0bd6c9f | 37 | #include "target.h" |
8845cb37 AS |
38 | |
/* Expand a block clear operation, and return 1 if successful.  Return 0
   if we should let the compiler generate normal code.

   operands[0] is the destination
   operands[1] is the length
   operands[3] is the alignment */

int
expand_block_clear (rtx operands[])
{
  rtx orig_dest = operands[0];
  rtx bytes_rtx	= operands[1];
  rtx align_rtx = operands[3];
  bool constp	= (GET_CODE (bytes_rtx) == CONST_INT);
  HOST_WIDE_INT align;
  HOST_WIDE_INT bytes;
  int offset;
  int clear_bytes;
  int clear_step;

  /* If this is not a fixed size clear, punt and let the caller emit a
     library call (memset) instead.  */
  if (! constp)
    return 0;

  /* This must be a fixed size alignment.  ALIGN is converted from bytes
     to bits here.  */
  gcc_assert (GET_CODE (align_rtx) == CONST_INT);
  align = INTVAL (align_rtx) * BITS_PER_UNIT;

  /* Anything to clear?  A zero-length clear is trivially done.  */
  bytes = INTVAL (bytes_rtx);
  if (bytes <= 0)
    return 1;

  /* Use the builtin memset after a point, to avoid huge code bloat.
     When optimize_size, avoid any significant code bloat; calling
     memset is about 4 instructions, so allow for one instruction to
     load zero and three to do clearing.  CLEAR_STEP is the widest
     store (in bytes) the target can use here: 16 with VMX/VSX, 8 with
     64-bit GPRs, otherwise 4.  */
  if (TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
    clear_step = 16;
  else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
    clear_step = 8;
  else
    clear_step = 4;

  if (optimize_size && bytes > 3 * clear_step)
    return 0;
  if (! optimize_size && bytes > 8 * clear_step)
    return 0;

  /* Emit the widest store allowed by the remaining byte count and the
     alignment, repeating until the block is fully cleared.  */
  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
    {
      machine_mode mode = BLKmode;
      rtx dest;

      if (bytes >= 16 && TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
	{
	  clear_bytes = 16;
	  mode = V4SImode;
	}
      else if (bytes >= 8 && TARGET_POWERPC64
	       && (align >= 64 || !STRICT_ALIGNMENT))
	{
	  clear_bytes = 8;
	  mode = DImode;
	  if (offset == 0 && align < 64)
	    {
	      rtx addr;

	      /* If the address form is reg+offset with offset not a
		 multiple of four, reload into reg indirect form here
		 rather than waiting for reload.  This way we get one
		 reload, not one per store.  */
	      addr = XEXP (orig_dest, 0);
	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
		  && GET_CODE (XEXP (addr, 1)) == CONST_INT
		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
		{
		  addr = copy_addr_to_reg (addr);
		  orig_dest = replace_equiv_address (orig_dest, addr);
		}
	    }
	}
      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
	{			/* move 4 bytes */
	  clear_bytes = 4;
	  mode = SImode;
	}
      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
	{			/* move 2 bytes */
	  clear_bytes = 2;
	  mode = HImode;
	}
      else /* move 1 byte at a time */
	{
	  clear_bytes = 1;
	  mode = QImode;
	}

      dest = adjust_address (orig_dest, mode, offset);

      emit_move_insn (dest, CONST0_RTX (mode));
    }

  return 1;
}
144 | ||
/* Figure out the correct instructions to generate to load data for
   block compare.  MODE is used for the read from memory, and
   data is zero extended if REG is wider than MODE.  If LE code
   is being generated, bswap loads are used so the comparison below
   operates on big-endian byte order regardless of host endianness.

   REG is the destination register to move the data into.
   MEM is the memory block being read.
   MODE is the mode of memory to use for the read.  */
static void
do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
{
  /* Dispatch on the width of the destination register first, then on
     the width of the memory read.  MODE wider than REG is a caller
     error and hits gcc_unreachable below.  */
  switch (GET_MODE (reg))
    {
    case E_DImode:
      switch (mode)
	{
	case E_QImode:
	  /* Single byte: no byte order issue, just zero extend.  */
	  emit_insn (gen_zero_extendqidi2 (reg, mem));
	  break;
	case E_HImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		/* Byte-swap into a fresh temporary before extending.  */
		src = gen_reg_rtx (HImode);
		emit_insn (gen_bswaphi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendhidi2 (reg, src));
	    break;
	  }
	case E_SImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (SImode);
		emit_insn (gen_bswapsi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendsidi2 (reg, src));
	  }
	  break;
	case E_DImode:
	  /* Full-width read: byte-swapping load or plain move.  */
	  if (!BYTES_BIG_ENDIAN)
	    emit_insn (gen_bswapdi2 (reg, mem));
	  else
	    emit_insn (gen_movdi (reg, mem));
	  break;
	default:
	  gcc_unreachable ();
	}
      break;

    case E_SImode:
      switch (mode)
	{
	case E_QImode:
	  emit_insn (gen_zero_extendqisi2 (reg, mem));
	  break;
	case E_HImode:
	  {
	    rtx src = mem;
	    if (!BYTES_BIG_ENDIAN)
	      {
		src = gen_reg_rtx (HImode);
		emit_insn (gen_bswaphi2 (src, mem));
	      }
	    emit_insn (gen_zero_extendhisi2 (reg, src));
	    break;
	  }
	case E_SImode:
	  if (!BYTES_BIG_ENDIAN)
	    emit_insn (gen_bswapsi2 (reg, mem));
	  else
	    emit_insn (gen_movsi (reg, mem));
	  break;
	case E_DImode:
	  /* DImode is larger than the destination reg so is not expected.  */
	  gcc_unreachable ();
	  break;
	default:
	  gcc_unreachable ();
	}
      break;
    default:
      gcc_unreachable ();
      break;
    }
}
233 | ||
/* Select the mode to be used for reading the next chunk of bytes
   in the compare.

   OFFSET is the current read offset from the beginning of the block.
   BYTES is the number of bytes remaining to be read.
   ALIGN is the minimum alignment of the memory blocks being compared in bytes.
   WORD_MODE_OK indicates using WORD_MODE is allowed, else SImode is
   the largest allowable mode.

   NOTE: the ordering of the tests below is significant; each case is
   only correct because the earlier ones did not match.  */
static machine_mode
select_block_compare_mode (unsigned HOST_WIDE_INT offset,
			   unsigned HOST_WIDE_INT bytes,
			   unsigned HOST_WIDE_INT align, bool word_mode_ok)
{
  /* First see if we can do a whole load unit
     as that will be more efficient than a larger load + shift.  */

  /* If big, use biggest chunk.
     If exactly chunk size, use that size.
     If remainder can be done in one piece with shifting, do that.
     Do largest chunk possible without violating alignment rules.  */

  /* The most we can read without potential page crossing.  Rounding
     the remaining byte count up to the alignment is safe because a
     block aligned to ALIGN cannot end closer than that to a page
     boundary.  */
  unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);

  if (word_mode_ok && bytes >= UNITS_PER_WORD)
    return word_mode;
  else if (bytes == GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes == GET_MODE_SIZE (HImode))
    return HImode;
  else if (bytes == GET_MODE_SIZE (QImode))
    return QImode;
  else if (bytes < GET_MODE_SIZE (SImode)
	   && offset >= GET_MODE_SIZE (SImode) - bytes)
    /* This matches the case were we have SImode and 3 bytes
       and offset >= 1 and permits us to move back one and overlap
       with the previous read, thus avoiding having to shift
       unwanted bytes off of the input.  */
    return SImode;
  else if (word_mode_ok && bytes < UNITS_PER_WORD
	   && offset >= UNITS_PER_WORD-bytes)
    /* Similarly, if we can use DImode it will get matched here and
       can do an overlapping read that ends at the end of the block.  */
    return word_mode;
  else if (word_mode_ok && maxread >= UNITS_PER_WORD)
    /* It is safe to do all remaining in one load of largest size,
       possibly with a shift to get rid of unwanted bytes.  */
    return word_mode;
  else if (maxread >= GET_MODE_SIZE (SImode))
    /* It is safe to do all remaining in one SImode load,
       possibly with a shift to get rid of unwanted bytes.  */
    return SImode;
  else if (bytes > GET_MODE_SIZE (SImode))
    return SImode;
  else if (bytes > GET_MODE_SIZE (HImode))
    return HImode;

  /* final fallback is do one byte */
  return QImode;
}
294 | ||
295 | /* Compute the alignment of pointer+OFFSET where the original alignment | |
296 | of pointer was BASE_ALIGN. */ | |
297 | static unsigned HOST_WIDE_INT | |
298 | compute_current_alignment (unsigned HOST_WIDE_INT base_align, | |
299 | unsigned HOST_WIDE_INT offset) | |
300 | { | |
301 | if (offset == 0) | |
302 | return base_align; | |
303 | return MIN (base_align, offset & -offset); | |
304 | } | |
305 | ||
5ec3397e AS |
306 | /* Prepare address and then do a load. |
307 | ||
308 | MODE is the mode to use for the load. | |
309 | DEST is the destination register for the data. | |
310 | ADDR is the address to be loaded. | |
311 | ORIG_ADDR is the original address expression. */ | |
312 | static void | |
313 | do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr, | |
314 | rtx orig_addr) | |
315 | { | |
316 | rtx mem = gen_rtx_MEM (mode, addr); | |
317 | MEM_COPY_ATTRIBUTES (mem, orig_addr); | |
318 | set_mem_size (mem, GET_MODE_SIZE (mode)); | |
319 | do_load_for_compare (dest, mem, mode); | |
320 | return; | |
321 | } | |
322 | ||
/* Do a branch for an if/else decision.

   CMPMODE is the mode to use for the comparison.
   COMPARISON is the rtx code for the compare needed.
   A is the first thing to be compared.
   B is the second thing to be compared.
   CR is the condition code reg input, or NULL_RTX.
   TRUE_LABEL is the label to branch to if the condition is true.

   If CR is null_rtx, then a new register of CMPMODE is generated and
   used for the comparison.
   If A and B are both null_rtx, then CR must not be null, and the
   compare is not generated so you can use this with a dot form insn.  */

static void
do_ifelse (machine_mode cmpmode, rtx_code comparison,
	   rtx a, rtx b, rtx cr, rtx true_label)
{
  /* Legal argument combinations: either a CR with no operands (reuse an
     already-set condition register) or both operands present.  */
  gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
	      || (a != NULL_RTX && b != NULL_RTX));

  if (cr != NULL_RTX)
    gcc_assert (GET_MODE (cr) == cmpmode);
  else
    cr = gen_reg_rtx (cmpmode);

  rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label);

  /* Only emit the compare when operands were supplied; otherwise the
     caller has already set CR (e.g. via a record-form instruction).  */
  if (a != NULL_RTX)
    emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b));

  rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);

  /* Conditional branch: fall through when the condition is false.  */
  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
  JUMP_LABEL (j) = true_label;
  LABEL_NUSES (true_label) += 1;
}
361 | ||
362 | /* Emit an isel of the proper mode for DEST. | |
363 | ||
364 | DEST is the isel destination register. | |
365 | SRC1 is the isel source if CR is true. | |
366 | SRC2 is the isel source if CR is false. | |
367 | CR is the condition for the isel. */ | |
368 | static void | |
369 | do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr) | |
370 | { | |
371 | if (GET_MODE (dest) == DImode) | |
372 | emit_insn (gen_isel_signed_di (dest, cmp, src_t, src_f, cr)); | |
373 | else | |
374 | emit_insn (gen_isel_signed_si (dest, cmp, src_t, src_f, cr)); | |
375 | } | |
376 | ||
377 | /* Emit a subtract of the proper mode for DEST. | |
378 | ||
379 | DEST is the destination register for the subtract. | |
380 | SRC1 is the first subtract input. | |
381 | SRC2 is the second subtract input. | |
382 | ||
383 | Computes DEST = SRC1-SRC2. */ | |
384 | static void | |
385 | do_sub3 (rtx dest, rtx src1, rtx src2) | |
386 | { | |
387 | if (GET_MODE (dest) == DImode) | |
388 | emit_insn (gen_subdi3 (dest, src1, src2)); | |
389 | else | |
390 | emit_insn (gen_subsi3 (dest, src1, src2)); | |
391 | } | |
392 | ||
393 | /* Emit an add of the proper mode for DEST. | |
394 | ||
395 | DEST is the destination register for the add. | |
396 | SRC1 is the first add input. | |
397 | SRC2 is the second add input. | |
398 | ||
399 | Computes DEST = SRC1+SRC2. */ | |
400 | static void | |
401 | do_add3 (rtx dest, rtx src1, rtx src2) | |
402 | { | |
403 | if (GET_MODE (dest) == DImode) | |
404 | emit_insn (gen_adddi3 (dest, src1, src2)); | |
405 | else | |
406 | emit_insn (gen_addsi3 (dest, src1, src2)); | |
407 | } | |
408 | ||
/* Generate rtl for a load, shift, and compare of less than a full word.

   LOAD_MODE is the machine mode for the loads.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem, rtx dcond,
		      rtx src1_addr, rtx src2_addr, rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  rtx shift_amount = gen_reg_rtx (word_mode);
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  /* Load a full LOAD_MODE chunk from each source, then compute how
     many excess bytes were read: load_mode_size - cmp_rem.  */
  do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2);
  do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem);

  /* Convert the excess byte count to a bit count and shift the
     unwanted low-order bytes out of both loaded values so only the
     CMP_REM bytes participate in the compare.  */
  if (word_mode == DImode)
    {
      emit_insn (gen_ashldi3 (shift_amount, shift_amount,
			      GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrdi3 (d1, d1,
			      gen_lowpart (SImode, shift_amount)));
      emit_insn (gen_lshrdi3 (d2, d2,
			      gen_lowpart (SImode, shift_amount)));
    }
  else
    {
      emit_insn (gen_ashlsi3 (shift_amount, shift_amount,
			      GEN_INT (LOG2_BITS_PER_UNIT)));
      emit_insn (gen_lshrsi3 (d1, d1, shift_amount));
      emit_insn (gen_lshrsi3 (d2, d2, shift_amount));
    }

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      /* Pre-P9 path: DIFF = D1 - D2 with carry, examined later.  */
      if (word_mode == DImode)
	emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
	emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}
463 | ||
/* Generate rtl for an overlapping load and compare of less than a
   full load_mode.  This assumes that the previous word is part of the
   block being compared so it's ok to back up part of a word so we can
   compare the last unaligned full word that ends at the end of the block.

   LOAD_MODE is the machine mode for the loads.
   ISCONST tells whether the remaining length is a constant or in a register.
   BYTES_REM is the remaining length if ISCONST is true.
   DIFF is the reg for the difference.
   CMP_REM is the reg containing the remaining bytes to compare if !ISCONST.
   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
   SRC1_ADDR is the first source address.
   SRC2_ADDR is the second source address.
   ORIG_SRC1 is the original first source block's address rtx.
   ORIG_SRC2 is the original second source block's address rtx.  */
static void
do_overlap_load_compare (machine_mode load_mode, bool isConst,
			 HOST_WIDE_INT bytes_rem, rtx diff,
			 rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr,
			 rtx orig_src1, rtx orig_src2)
{
  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
  /* How far to back up so a full LOAD_MODE read ends exactly at the
     end of the block (constant-length case only).  */
  HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem;
  rtx d1 = gen_reg_rtx (word_mode);
  rtx d2 = gen_reg_rtx (word_mode);

  rtx addr1, addr2;
  if (!isConst || addr_adj)
    {
      rtx adj_reg = gen_reg_rtx (word_mode);
      if (isConst)
	emit_move_insn (adj_reg, GEN_INT (-addr_adj));
      else
	{
	  /* Runtime length: adjustment is cmp_rem - load_mode_size,
	     i.e. negative, backing the address up so the read ends at
	     the last byte of the block.  */
	  rtx reg_lms = gen_reg_rtx (word_mode);
	  emit_move_insn (reg_lms, GEN_INT (load_mode_size));
	  do_sub3 (adj_reg, cmp_rem, reg_lms);
	}

      addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg);
      addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg);
    }
  else
    {
      /* Constant length exactly equal to LOAD_MODE's size: no
	 back-up needed.  */
      addr1 = src1_addr;
      addr2 = src2_addr;
    }

  do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
  do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);

  if (TARGET_P9_MISC)
    {
      /* Generate a compare, and convert with a setb later.  */
      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
      emit_insn (gen_rtx_SET (dcond, cmp));
    }
  else
    {
      /* Pre-P9 path: DIFF = D1 - D2 with carry, examined later.  */
      if (word_mode == DImode)
	emit_insn (gen_subfdi3_carry (diff, d2, d1));
      else
	emit_insn (gen_subfsi3_carry (diff, d2, d1));
    }
}
529 | ||
530 | /* Expand a block compare operation using loop code, and return true | |
531 | if successful. Return false if we should let the compiler generate | |
532 | normal code, probably a memcmp call. | |
533 | ||
534 | OPERANDS[0] is the target (result). | |
535 | OPERANDS[1] is the first source. | |
536 | OPERANDS[2] is the second source. | |
537 | OPERANDS[3] is the length. | |
538 | OPERANDS[4] is the alignment. */ | |
539 | bool | |
540 | expand_compare_loop (rtx operands[]) | |
541 | { | |
542 | rtx target = operands[0]; | |
543 | rtx orig_src1 = operands[1]; | |
544 | rtx orig_src2 = operands[2]; | |
545 | rtx bytes_rtx = operands[3]; | |
546 | rtx align_rtx = operands[4]; | |
547 | ||
548 | /* This case is complicated to handle because the subtract | |
549 | with carry instructions do not generate the 64-bit | |
550 | carry and so we must emit code to calculate it ourselves. | |
551 | We choose not to implement this yet. */ | |
552 | if (TARGET_32BIT && TARGET_POWERPC64) | |
553 | return false; | |
554 | ||
555 | /* Allow non-const length. */ | |
556 | int bytes_is_const = CONST_INT_P (bytes_rtx); | |
557 | ||
558 | /* This must be a fixed size alignment. */ | |
559 | if (!CONST_INT_P (align_rtx)) | |
560 | return false; | |
561 | ||
562 | HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT; | |
563 | HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT; | |
564 | HOST_WIDE_INT minalign = MIN (align1, align2); | |
565 | ||
566 | bool isP7 = (rs6000_tune == PROCESSOR_POWER7); | |
567 | ||
568 | gcc_assert (GET_MODE (target) == SImode); | |
569 | ||
570 | /* Anything to move? */ | |
571 | HOST_WIDE_INT bytes = 0; | |
572 | if (bytes_is_const) | |
573 | bytes = INTVAL (bytes_rtx); | |
574 | ||
575 | if (bytes_is_const && bytes == 0) | |
576 | return true; | |
577 | ||
578 | /* Limit the amount we compare, if known statically. */ | |
579 | HOST_WIDE_INT max_bytes; | |
580 | switch (rs6000_tune) | |
581 | { | |
582 | case PROCESSOR_POWER7: | |
583 | if (!bytes_is_const) | |
584 | if (minalign < 8) | |
585 | max_bytes = 0; | |
586 | else | |
587 | max_bytes = 128; | |
588 | else | |
589 | if (minalign < 8) | |
590 | max_bytes = 32; | |
591 | else | |
592 | max_bytes = 128; | |
593 | break; | |
594 | case PROCESSOR_POWER8: | |
595 | if (!bytes_is_const) | |
596 | max_bytes = 0; | |
597 | else | |
598 | if (minalign < 8) | |
599 | max_bytes = 128; | |
600 | else | |
601 | max_bytes = 64; | |
602 | break; | |
603 | case PROCESSOR_POWER9: | |
604 | if (bytes_is_const) | |
605 | max_bytes = 191; | |
606 | else | |
607 | max_bytes = 0; | |
608 | break; | |
609 | default: | |
610 | max_bytes = 128; | |
611 | } | |
612 | ||
613 | /* Allow the option to override the default. */ | |
614 | if (rs6000_block_compare_inline_loop_limit >= 0) | |
615 | max_bytes = (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_loop_limit; | |
616 | ||
617 | if (max_bytes == 0) | |
618 | return false; | |
619 | ||
620 | rtx cmp_rem = gen_reg_rtx (word_mode); /* Remainder for library call. */ | |
621 | rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop. */ | |
622 | HOST_WIDE_INT niter; | |
623 | rtx iter = gen_reg_rtx (word_mode); | |
624 | rtx iv1 = gen_reg_rtx (word_mode); | |
625 | rtx iv2 = gen_reg_rtx (word_mode); | |
626 | rtx d1_1 = gen_reg_rtx (word_mode); /* Addr expression src1+iv1 */ | |
627 | rtx d1_2 = gen_reg_rtx (word_mode); /* Addr expression src1+iv2 */ | |
628 | rtx d2_1 = gen_reg_rtx (word_mode); /* Addr expression src2+iv1 */ | |
629 | rtx d2_2 = gen_reg_rtx (word_mode); /* Addr expression src2+iv2 */ | |
630 | ||
631 | /* Strip unneeded subreg from length if there is one. */ | |
632 | if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx)) | |
633 | bytes_rtx = SUBREG_REG (bytes_rtx); | |
634 | /* Extend bytes_rtx to word_mode if needed. But, we expect only to | |
635 | maybe have to deal with the case were bytes_rtx is SImode and | |
636 | word_mode is DImode. */ | |
637 | if (!bytes_is_const) | |
638 | { | |
639 | if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode)) | |
640 | /* Do not expect length longer than word_mode. */ | |
641 | return false; | |
642 | else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode)) | |
643 | { | |
644 | bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx); | |
645 | bytes_rtx = force_reg (word_mode, | |
646 | gen_rtx_fmt_e (ZERO_EXTEND, word_mode, | |
647 | bytes_rtx)); | |
648 | } | |
649 | else | |
650 | /* Make sure it's in a register before we get started. */ | |
651 | bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx); | |
652 | } | |
653 | ||
654 | machine_mode load_mode = word_mode; | |
655 | HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode); | |
656 | ||
657 | /* Number of bytes per iteration of the unrolled loop. */ | |
658 | HOST_WIDE_INT loop_bytes = 2 * load_mode_size; | |
659 | /* max iters and bytes compared in the loop. */ | |
660 | HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes; | |
661 | HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes; | |
662 | int l2lb = floor_log2 (loop_bytes); | |
663 | ||
664 | if (bytes_is_const && (max_bytes < load_mode_size | |
665 | || !IN_RANGE (bytes, load_mode_size, max_bytes))) | |
666 | return false; | |
667 | ||
668 | bool no_remainder_code = false; | |
669 | rtx final_label = gen_label_rtx (); | |
670 | rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); | |
671 | rtx diff_label = gen_label_rtx (); | |
672 | rtx library_call_label = NULL; | |
673 | rtx cleanup_label = gen_label_rtx (); | |
674 | ||
675 | rtx cr; | |
676 | ||
677 | rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0)); | |
678 | rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0)); | |
679 | ||
680 | /* Difference found is stored here before jump to diff_label. */ | |
681 | rtx diff = gen_reg_rtx (word_mode); | |
682 | rtx j; | |
683 | ||
684 | /* Example of generated code for 35 bytes aligned 1 byte. | |
685 | ||
686 | mtctr 8 | |
687 | li 6,0 | |
688 | li 5,8 | |
689 | .L13: | |
690 | ldbrx 7,3,6 | |
691 | ldbrx 9,10,6 | |
692 | ldbrx 0,3,5 | |
693 | ldbrx 4,10,5 | |
694 | addi 6,6,16 | |
695 | addi 5,5,16 | |
696 | subfc. 9,9,7 | |
697 | bne 0,.L10 | |
698 | subfc. 9,4,0 | |
699 | bdnzt 2,.L13 | |
700 | bne 0,.L10 | |
701 | add 3,3,6 | |
702 | add 10,10,6 | |
703 | addi 9,3,-5 | |
704 | ldbrx 7,0,9 | |
705 | addi 9,10,-5 | |
706 | ldbrx 9,0,9 | |
707 | subfc 9,9,7 | |
708 | .p2align 4,,15 | |
709 | .L10: | |
710 | popcntd 9,9 | |
711 | subfe 10,10,10 | |
712 | or 9,9,10 | |
713 | ||
714 | Compiled with -fno-reorder-blocks for clarity. */ | |
715 | ||
716 | /* Structure of what we're going to do: | |
717 | Two separate lengths: what we will compare before bailing to library | |
718 | call (max_bytes), and the total length to be checked. | |
719 | if length <= 16, branch to linear cleanup code starting with | |
720 | remainder length check (length not known at compile time) | |
721 | set up 2 iv's and load count reg, compute remainder length | |
722 | unrollx2 compare loop | |
723 | if loop exit due to a difference, branch to difference handling code | |
724 | if remainder length < 8, branch to final cleanup compare | |
725 | load and compare 8B | |
726 | final cleanup comparison (depends on alignment and length) | |
727 | load 8B, shift off bytes past length, compare | |
728 | load 8B ending at last byte and compare | |
729 | load/compare 1 byte at a time (short block abutting 4k boundary) | |
730 | difference handling, 64->32 conversion | |
731 | final result | |
732 | branch around memcmp call | |
733 | memcmp library call | |
734 | */ | |
735 | ||
736 | /* If bytes is not const, compare length and branch directly | |
737 | to the cleanup code that can handle 0-16 bytes if length | |
738 | is >= 16. Stash away bytes-max_bytes for the library call. */ | |
739 | if (bytes_is_const) | |
740 | { | |
741 | /* These need to be set for some of the places we may jump to. */ | |
742 | if (bytes > max_bytes) | |
743 | { | |
744 | no_remainder_code = true; | |
745 | niter = max_loop_iter; | |
746 | library_call_label = gen_label_rtx (); | |
747 | } | |
748 | else | |
749 | { | |
750 | niter = bytes / loop_bytes; | |
751 | } | |
752 | emit_move_insn (iter, GEN_INT (niter)); | |
753 | emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes)); | |
754 | emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes)); | |
755 | } | |
756 | else | |
757 | { | |
758 | library_call_label = gen_label_rtx (); | |
759 | ||
760 | /* If we go to the cleanup code, it expects length to be in cmp_rem. */ | |
761 | emit_move_insn (cmp_rem, bytes_rtx); | |
762 | ||
763 | /* Check for > max_bytes bytes. We want to bail out as quickly as | |
764 | possible if we have to go over to memcmp. */ | |
765 | do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes), | |
766 | NULL_RTX, library_call_label); | |
767 | ||
768 | /* Check for < loop_bytes bytes. */ | |
769 | do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes), | |
770 | NULL_RTX, cleanup_label); | |
771 | ||
772 | /* Loop compare bytes and iterations if bytes>max_bytes. */ | |
773 | rtx mb_reg = gen_reg_rtx (word_mode); | |
774 | emit_move_insn (mb_reg, GEN_INT (max_loop_bytes)); | |
775 | rtx mi_reg = gen_reg_rtx (word_mode); | |
776 | emit_move_insn (mi_reg, GEN_INT (max_loop_iter)); | |
777 | ||
778 | /* Compute number of loop iterations if bytes <= max_bytes. */ | |
779 | if (word_mode == DImode) | |
780 | emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb))); | |
781 | else | |
782 | emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb))); | |
783 | ||
784 | /* Compute bytes to compare in loop if bytes <= max_bytes. */ | |
785 | rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb); | |
786 | if (word_mode == DImode) | |
787 | { | |
788 | emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask)); | |
789 | } | |
790 | else | |
791 | { | |
792 | emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask)); | |
793 | } | |
794 | ||
795 | /* Check for bytes <= max_bytes. */ | |
796 | if (TARGET_ISEL) | |
797 | { | |
798 | /* P9 has fast isel so we use one compare and two isel. */ | |
799 | cr = gen_reg_rtx (CCmode); | |
800 | rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx, | |
801 | GEN_INT (max_bytes)); | |
802 | emit_move_insn (cr, compare_rtx); | |
803 | rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx); | |
804 | do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr); | |
805 | do_isel (iter, cmp_rtx, iter, mi_reg, cr); | |
806 | } | |
807 | else | |
808 | { | |
809 | rtx lab_after = gen_label_rtx (); | |
810 | do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes), | |
811 | NULL_RTX, lab_after); | |
812 | emit_move_insn (loop_cmp, mb_reg); | |
813 | emit_move_insn (iter, mi_reg); | |
814 | emit_label (lab_after); | |
815 | } | |
816 | ||
817 | /* Now compute remainder bytes which isn't used until after the loop. */ | |
818 | do_sub3 (cmp_rem, bytes_rtx, loop_cmp); | |
819 | } | |
820 | ||
821 | rtx dcond = NULL_RTX; /* Used for when we jump to diff_label. */ | |
822 | /* For p9 we need to have just one of these as multiple places define | |
823 | it and it gets used by the setb at the end. */ | |
824 | if (TARGET_P9_MISC) | |
825 | dcond = gen_reg_rtx (CCUNSmode); | |
826 | ||
827 | if (!bytes_is_const || bytes >= loop_bytes) | |
828 | { | |
829 | /* It should not be possible to come here if remaining bytes is | |
830 | < 16 in the runtime case either. Compute number of loop | |
831 | iterations. We compare 2*word_mode per iteration so 16B for | |
832 | 64-bit code and 8B for 32-bit. Set up two induction | |
833 | variables and load count register. */ | |
834 | ||
835 | /* HACK ALERT: create hard reg for CTR here. If we just use a | |
836 | pseudo, cse will get rid of it and then the allocator will | |
837 | see it used in the lshr above and won't give us ctr. */ | |
838 | rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO); | |
839 | emit_move_insn (ctr, iter); | |
840 | emit_move_insn (diff, GEN_INT (0)); | |
841 | emit_move_insn (iv1, GEN_INT (0)); | |
842 | emit_move_insn (iv2, GEN_INT (load_mode_size)); | |
843 | ||
844 | /* inner loop to compare 2*word_mode */ | |
845 | rtx loop_top_label = gen_label_rtx (); | |
846 | emit_label (loop_top_label); | |
847 | ||
848 | rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1); | |
849 | rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1); | |
850 | ||
851 | do_load_for_compare_from_addr (load_mode, d1_1, | |
852 | src1_ix1, orig_src1); | |
853 | do_load_for_compare_from_addr (load_mode, d2_1, | |
854 | src2_ix1, orig_src2); | |
855 | do_add3 (iv1, iv1, GEN_INT (loop_bytes)); | |
856 | ||
857 | rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2); | |
858 | rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2); | |
859 | ||
860 | do_load_for_compare_from_addr (load_mode, d1_2, | |
861 | src1_ix2, orig_src1); | |
862 | do_load_for_compare_from_addr (load_mode, d2_2, | |
863 | src2_ix2, orig_src2); | |
864 | do_add3 (iv2, iv2, GEN_INT (loop_bytes)); | |
865 | ||
866 | if (TARGET_P9_MISC) | |
867 | { | |
868 | /* Generate a compare, and convert with a setb later. */ | |
869 | rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1); | |
870 | emit_insn (gen_rtx_SET (dcond, cmp)); | |
871 | } | |
872 | else | |
873 | { | |
874 | dcond = gen_reg_rtx (CCmode); | |
875 | if (word_mode == DImode) | |
876 | emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond)); | |
877 | else | |
878 | emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond)); | |
879 | } | |
880 | ||
881 | do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX, | |
882 | dcond, diff_label); | |
883 | ||
884 | if (TARGET_P9_MISC) | |
885 | { | |
886 | /* Generate a compare, and convert with a setb later. */ | |
887 | rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2); | |
888 | emit_insn (gen_rtx_SET (dcond, cmp)); | |
889 | } | |
890 | else | |
891 | { | |
892 | dcond = gen_reg_rtx (CCmode); | |
893 | if (word_mode == DImode) | |
894 | emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond)); | |
895 | else | |
896 | emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond)); | |
897 | } | |
898 | ||
899 | rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2); | |
900 | if (TARGET_64BIT) | |
901 | j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr, | |
902 | eqrtx, dcond)); | |
903 | else | |
904 | j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr, | |
905 | eqrtx, dcond)); | |
906 | JUMP_LABEL (j) = loop_top_label; | |
907 | LABEL_NUSES (loop_top_label) += 1; | |
908 | } | |
909 | ||
910 | HOST_WIDE_INT bytes_remaining = 0; | |
911 | if (bytes_is_const) | |
912 | bytes_remaining = (bytes % loop_bytes); | |
913 | ||
914 | /* If diff is nonzero, branch to difference handling | |
915 | code. If we exit here with a nonzero diff, it is | |
916 | because the second word differed. */ | |
917 | if (TARGET_P9_MISC) | |
918 | do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond, diff_label); | |
919 | else | |
920 | do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX, diff_label); | |
921 | ||
922 | if (library_call_label != NULL && bytes_is_const && bytes > max_bytes) | |
923 | { | |
924 | /* If the length is known at compile time, then we will always | |
925 | have a remainder to go to the library call with. */ | |
926 | rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, library_call_label); | |
927 | j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref)); | |
928 | JUMP_LABEL (j) = library_call_label; | |
929 | LABEL_NUSES (library_call_label) += 1; | |
930 | emit_barrier (); | |
931 | } | |
932 | ||
933 | if (bytes_is_const && bytes_remaining == 0) | |
934 | { | |
935 | /* No remainder and if we are here then diff is 0 so just return 0 */ | |
936 | if (TARGET_64BIT) | |
937 | emit_insn (gen_movsi (target, gen_lowpart (SImode, diff))); | |
938 | else | |
939 | emit_move_insn (target, diff); | |
940 | j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref)); | |
941 | JUMP_LABEL (j) = final_label; | |
942 | LABEL_NUSES (final_label) += 1; | |
943 | emit_barrier (); | |
944 | } | |
945 | else if (!no_remainder_code) | |
946 | { | |
947 | /* Update addresses to point to the next word to examine. */ | |
948 | do_add3 (src1_addr, src1_addr, iv1); | |
949 | do_add3 (src2_addr, src2_addr, iv1); | |
950 | ||
951 | emit_label (cleanup_label); | |
952 | ||
953 | if (!bytes_is_const) | |
954 | { | |
955 | /* If we're dealing with runtime length, we have to check if | |
956 | it's zero after the loop. When length is known at compile | |
957 | time the no-remainder condition is dealt with above. By | |
958 | doing this after cleanup_label, we also deal with the | |
959 | case where length is 0 at the start and we bypass the | |
960 | loop with a branch to cleanup_label. */ | |
961 | emit_move_insn (target, const0_rtx); | |
962 | do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, | |
963 | NULL_RTX, final_label); | |
964 | } | |
965 | ||
966 | rtx final_cleanup = gen_label_rtx (); | |
967 | rtx cmp_rem_before = gen_reg_rtx (word_mode); | |
968 | /* Compare one more word_mode chunk if needed. */ | |
969 | if (!bytes_is_const | |
970 | || (bytes_is_const && bytes_remaining >= load_mode_size)) | |
971 | { | |
972 | /* If remainder length < word length, branch to final | |
973 | cleanup compare. */ | |
974 | if (!bytes_is_const) | |
975 | do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size), | |
976 | NULL_RTX, final_cleanup); | |
977 | ||
978 | /* load and compare 8B */ | |
979 | do_load_for_compare_from_addr (load_mode, d1_1, | |
980 | src1_addr, orig_src1); | |
981 | do_load_for_compare_from_addr (load_mode, d2_1, | |
982 | src2_addr, orig_src2); | |
983 | ||
984 | /* Compare the word, see if we need to do the last partial. */ | |
985 | if (TARGET_P9_MISC) | |
986 | { | |
987 | /* Generate a compare, and convert with a setb later. */ | |
988 | rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1); | |
989 | emit_insn (gen_rtx_SET (dcond, cmp)); | |
990 | } | |
991 | else | |
992 | { | |
993 | dcond = gen_reg_rtx (CCmode); | |
994 | if (word_mode == DImode) | |
995 | emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond)); | |
996 | else | |
997 | emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond)); | |
998 | } | |
999 | ||
1000 | do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX, | |
1001 | dcond, diff_label); | |
1002 | ||
1003 | do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size)); | |
1004 | do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size)); | |
1005 | emit_move_insn (cmp_rem_before, cmp_rem); | |
1006 | do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size)); | |
1007 | if (bytes_is_const) | |
1008 | bytes_remaining -= load_mode_size; | |
1009 | else | |
1010 | /* See if remaining length is now zero. We previously set | |
1011 | target to 0 so we can just jump to the end. */ | |
1012 | do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, | |
1013 | NULL_RTX, final_label); | |
1014 | ||
1015 | } | |
1016 | ||
1017 | /* Cases: | |
1018 | bytes_is_const | |
1019 | We can always shift back to do an overlapping compare | |
1020 | of the last chunk because we know length >= 8. | |
1021 | ||
1022 | !bytes_is_const | |
1023 | align>=load_mode_size | |
1024 | Read word_mode and mask | |
1025 | align<load_mode_size | |
1026 | avoid stepping past end | |
1027 | ||
1028 | Three strategies: | |
1029 | * decrement address and do overlapping compare | |
1030 | * read word_mode and mask | |
1031 | * carefully avoid crossing 4k boundary | |
1032 | */ | |
1033 | ||
1034 | if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7)) | |
1035 | && align1 >= load_mode_size && align2 >= load_mode_size) | |
1036 | { | |
1037 | /* Alignment is larger than word_mode so we do not need to be | |
1038 | concerned with extra page crossings. But, we do not know | |
1039 | that the length is larger than load_mode_size so we might | |
1040 | end up comparing against data before the block if we try | |
1041 | an overlapping compare. Also we use this on P7 for fixed length | |
1042 | remainder because P7 doesn't like overlapping unaligned. | |
1043 | Strategy: load 8B, shift off bytes past length, and compare. */ | |
1044 | emit_label (final_cleanup); | |
1045 | do_load_mask_compare (load_mode, diff, cmp_rem, dcond, | |
1046 | src1_addr, src2_addr, orig_src1, orig_src2); | |
1047 | } | |
1048 | else if (bytes_remaining && bytes_is_const) | |
1049 | { | |
1050 | /* We do not do loop expand if length < 32 so we know at the | |
1051 | end we can do an overlapping compare. | |
1052 | Strategy: shift address back and do word_mode load that | |
1053 | ends at the end of the block. */ | |
1054 | emit_label (final_cleanup); | |
1055 | do_overlap_load_compare (load_mode, true, bytes_remaining, diff, | |
1056 | cmp_rem, dcond, src1_addr, src2_addr, | |
1057 | orig_src1, orig_src2); | |
1058 | } | |
1059 | else if (!bytes_is_const) | |
1060 | { | |
1061 | rtx handle4k_label = gen_label_rtx (); | |
1062 | rtx nonconst_overlap = gen_label_rtx (); | |
1063 | emit_label (nonconst_overlap); | |
1064 | ||
1065 | /* Here we have to handle the case where we have runtime | |
1066 | length which may be too short for overlap compare, and | |
1067 | alignment is not at least load_mode_size so we have to | |
1068 | tread carefully to avoid stepping across 4k boundaries. */ | |
1069 | ||
1070 | /* If the length after the loop was larger than word_mode | |
1071 | size, we can just do an overlapping compare and we're | |
1072 | done. We fall through to this code from the word_mode | |
1073 | compare that precedes this. */ | |
1074 | do_overlap_load_compare (load_mode, false, 0, diff, | |
1075 | cmp_rem, dcond, src1_addr, src2_addr, | |
1076 | orig_src1, orig_src2); | |
1077 | ||
1078 | rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label); | |
1079 | j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref)); | |
1080 | JUMP_LABEL (j) = diff_label; | |
1081 | LABEL_NUSES (diff_label) += 1; | |
1082 | emit_barrier (); | |
1083 | ||
1084 | /* If we couldn't do the overlap compare we have to be more | |
1085 | careful of the 4k boundary. Test to see if either | |
1086 | address is less than word_mode_size away from a 4k | |
1087 | boundary. If not, then we can do a load/shift/compare | |
1088 | and we are done. We come to this code if length was less | |
1089 | than word_mode_size. */ | |
1090 | ||
1091 | emit_label (final_cleanup); | |
1092 | ||
1093 | /* We can still avoid the slow case if the length was larger | |
1094 | than one loop iteration, in which case go do the overlap | |
1095 | load compare path. */ | |
1096 | do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes), | |
1097 | NULL_RTX, nonconst_overlap); | |
1098 | ||
1099 | rtx rem4k = gen_reg_rtx (word_mode); | |
1100 | rtx dist1 = gen_reg_rtx (word_mode); | |
1101 | rtx dist2 = gen_reg_rtx (word_mode); | |
1102 | do_sub3 (rem4k, GEN_INT (4096), cmp_rem); | |
1103 | if (word_mode == SImode) | |
1104 | emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff))); | |
1105 | else | |
1106 | emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff))); | |
1107 | do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX, handle4k_label); | |
1108 | if (word_mode == SImode) | |
1109 | emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff))); | |
1110 | else | |
1111 | emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff))); | |
1112 | do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX, handle4k_label); | |
1113 | ||
1114 | /* We don't have a 4k boundary to deal with, so do | |
1115 | a load/shift/compare and jump to diff. */ | |
1116 | ||
1117 | do_load_mask_compare (load_mode, diff, cmp_rem, dcond, | |
1118 | src1_addr, src2_addr, orig_src1, orig_src2); | |
1119 | ||
1120 | j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref)); | |
1121 | JUMP_LABEL (j) = diff_label; | |
1122 | LABEL_NUSES (diff_label) += 1; | |
1123 | emit_barrier (); | |
1124 | ||
1125 | /* Finally in the unlikely case we are inching up to a | |
1126 | 4k boundary we use a compact lbzx/compare loop to do | |
1127 | it a byte at a time. */ | |
1128 | ||
1129 | emit_label (handle4k_label); | |
1130 | ||
1131 | rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO); | |
1132 | emit_move_insn (ctr, cmp_rem); | |
1133 | rtx ixreg = gen_reg_rtx (Pmode); | |
1134 | emit_move_insn (ixreg, const0_rtx); | |
1135 | ||
1136 | rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg); | |
1137 | rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg); | |
1138 | rtx d1 = gen_reg_rtx (word_mode); | |
1139 | rtx d2 = gen_reg_rtx (word_mode); | |
1140 | ||
1141 | rtx fc_loop = gen_label_rtx (); | |
1142 | emit_label (fc_loop); | |
1143 | ||
1144 | do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1); | |
1145 | do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2); | |
1146 | ||
1147 | do_add3 (ixreg, ixreg, const1_rtx); | |
1148 | ||
1149 | rtx cond = gen_reg_rtx (CCmode); | |
1150 | rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2); | |
1151 | rs6000_emit_dot_insn (diff, subexpr, 2, cond); | |
1152 | ||
1153 | rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2); | |
1154 | if (TARGET_64BIT) | |
1155 | j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr, | |
1156 | eqrtx, cond)); | |
1157 | else | |
1158 | j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr, | |
1159 | eqrtx, cond)); | |
1160 | JUMP_LABEL (j) = fc_loop; | |
1161 | LABEL_NUSES (fc_loop) += 1; | |
1162 | ||
1163 | if (TARGET_64BIT) | |
1164 | emit_insn (gen_movsi (target, gen_lowpart (SImode, diff))); | |
1165 | else | |
1166 | emit_move_insn (target, diff); | |
1167 | ||
1168 | /* Since we are comparing bytes, the difference can be used | |
1169 | as the final result and we are done here. */ | |
1170 | j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref)); | |
1171 | JUMP_LABEL (j) = final_label; | |
1172 | LABEL_NUSES (final_label) += 1; | |
1173 | emit_barrier (); | |
1174 | } | |
1175 | } | |
1176 | ||
1177 | emit_label (diff_label); | |
1178 | /* difference handling, 64->32 conversion */ | |
1179 | ||
1180 | /* We need to produce DI result from sub, then convert to target SI | |
1181 | while maintaining <0 / ==0 / >0 properties. This sequence works: | |
1182 | subfc L,A,B | |
1183 | subfe H,H,H | |
1184 | popcntd L,L | |
1185 | rldimi L,H,6,0 | |
1186 | ||
1187 | This is an alternate one Segher cooked up if somebody | |
1188 | wants to expand this for something that doesn't have popcntd: | |
1189 | subfc L,a,b | |
1190 | subfe H,x,x | |
1191 | addic t,L,-1 | |
1192 | subfe v,t,L | |
1193 | or z,v,H | |
1194 | ||
1195 | And finally, p9 can just do this: | |
1196 | cmpld A,B | |
1197 | setb r */ | |
1198 | ||
1199 | if (TARGET_P9_MISC) | |
1200 | emit_insn (gen_setb_unsigned (target, dcond)); | |
1201 | else | |
1202 | { | |
1203 | if (TARGET_64BIT) | |
1204 | { | |
1205 | rtx tmp_reg_ca = gen_reg_rtx (DImode); | |
1206 | emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca)); | |
1207 | emit_insn (gen_popcntddi2 (diff, diff)); | |
1208 | emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca)); | |
1209 | emit_insn (gen_movsi (target, gen_lowpart (SImode, diff))); | |
1210 | } | |
1211 | else | |
1212 | { | |
1213 | rtx tmp_reg_ca = gen_reg_rtx (SImode); | |
1214 | emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca)); | |
1215 | emit_insn (gen_popcntdsi2 (diff, diff)); | |
1216 | emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca)); | |
1217 | } | |
1218 | } | |
1219 | ||
1220 | if (library_call_label != NULL) | |
1221 | { | |
1222 | /* Branch around memcmp call. */ | |
1223 | j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref)); | |
1224 | JUMP_LABEL (j) = final_label; | |
1225 | LABEL_NUSES (final_label) += 1; | |
1226 | emit_barrier (); | |
1227 | ||
1228 | /* Make memcmp library call. cmp_rem is the remaining bytes that | |
1229 | were compared and cmp_rem is the expected amount to be compared | |
1230 | by memcmp. If we don't find a difference in the loop compare, do | |
1231 | the library call directly instead of doing a small compare just | |
1232 | to get to an arbitrary boundary before calling it anyway. | |
1233 | Also, update addresses to point to the next word to examine. */ | |
1234 | emit_label (library_call_label); | |
1235 | ||
1236 | rtx len_rtx = gen_reg_rtx (word_mode); | |
1237 | if (bytes_is_const) | |
1238 | { | |
1239 | emit_move_insn (len_rtx, cmp_rem); | |
1240 | do_add3 (src1_addr, src1_addr, iv1); | |
1241 | do_add3 (src2_addr, src2_addr, iv1); | |
1242 | } | |
1243 | else | |
1244 | emit_move_insn (len_rtx, bytes_rtx); | |
1245 | ||
1246 | tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP); | |
1247 | emit_library_call_value (XEXP (DECL_RTL (fun), 0), | |
1248 | target, LCT_NORMAL, GET_MODE (target), | |
1249 | src1_addr, Pmode, | |
1250 | src2_addr, Pmode, | |
1251 | len_rtx, GET_MODE (len_rtx)); | |
1252 | } | |
1253 | ||
1254 | /* emit final_label */ | |
1255 | emit_label (final_label); | |
1256 | return true; | |
1257 | } | |
1258 | ||
8845cb37 AS |
/* Expand a block compare operation, and return true if successful.
   Return false if we should let the compiler generate normal code,
   probably a memcmp call.

   OPERANDS[0] is the target (result, SImode).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment.  */
bool
expand_block_compare (rtx operands[])
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx = operands[3];
  rtx align_rtx = operands[4];
  HOST_WIDE_INT cmp_bytes = 0;
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;

  /* This case is complicated to handle because the subtract
     with carry instructions do not generate the 64-bit
     carry and so we must emit code to calculate it ourselves.
     We choose not to implement this yet.  */
  if (TARGET_32BIT && TARGET_POWERPC64)
    return false;

  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);

  /* Allow this param to shut off all expansion.  */
  if (rs6000_block_compare_inline_limit == 0)
    return false;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.
     However slow_unaligned_access returns true on P7 even though the
     performance of this code is good there.  */
  if (!isP7
      && (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
	  || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2))))
    return false;

  /* Unaligned l*brx traps on P7 so don't do this.  However this should
     not affect much because LE isn't really supported on P7 anyway.  */
  if (isP7 && !BYTES_BIG_ENDIAN)
    return false;

  /* If this is not a fixed size compare, try generating loop code and
     if that fails just call memcmp.  */
  if (!CONST_INT_P (bytes_rtx))
    return expand_compare_loop (operands);

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;

  gcc_assert (GET_MODE (target) == SImode);

  /* Anything to move?  */
  unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
  if (bytes == 0)
    return true;

  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
  /* P7/P8 code uses cond for subfc. but P9 uses
     it for cmpld which needs CCUNSmode.  */
  rtx cond;
  if (TARGET_P9_MISC)
    cond = gen_reg_rtx (CCUNSmode);
  else
    cond = gen_reg_rtx (CCmode);

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
		       && word_mode == DImode);

  /* Strategy phase.  How many ops will this take and should we expand it?  */

  unsigned HOST_WIDE_INT offset = 0;
  machine_mode load_mode =
    select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);

  /* We don't want to generate too much code.  The loop code can take
     over for lengths greater than 31 bytes.  */
  unsigned HOST_WIDE_INT max_bytes = rs6000_block_compare_inline_limit;
  if (!IN_RANGE (bytes, 1, max_bytes))
    return expand_compare_loop (operands);

  /* The code generated for p7 and older is not faster than glibc
     memcmp if alignment is small and length is not short, so bail
     out to avoid those conditions.  */
  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
      && ((base_align == 1 && bytes > 16)
	  || (base_align == 2 && bytes > 32)))
    return false;

  bool generate_6432_conversion = false;
  rtx convert_label = NULL;
  rtx final_label = NULL;

  /* Example of generated code for 18 bytes aligned 1 byte.
     Compiled with -fno-reorder-blocks for clarity.
	     ldbrx 10,31,8
	     ldbrx 9,7,8
	     subfc. 9,9,10
	     bne 0,.L6487
	     addi 9,12,8
	     addi 5,11,8
	     ldbrx 10,0,9
	     ldbrx 9,0,5
	     subfc. 9,9,10
	     bne 0,.L6487
	     addi 9,12,16
	     lhbrx 10,0,9
	     addi 9,11,16
	     lhbrx 9,0,9
	     subf 9,9,10
	     b .L6488
	     .p2align 4,,15
	     .L6487: #convert_label
	     popcntd 9,9
	     subfe 10,10,10
	     or 9,9,10
	     .L6488: #final_label
	     extsw 10,9

     We start off with DImode for two blocks that jump to the DI->SI conversion
     if the difference is found there, then a final block of HImode that skips
     the DI->SI conversion.  */

  while (bytes > 0)
    {
      unsigned int align = compute_current_alignment (base_align, offset);
      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	load_mode = select_block_compare_mode (offset, bytes, align,
					       word_mode_ok);
      else
	load_mode = select_block_compare_mode (0, bytes, align, word_mode_ok);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes >= load_mode_size)
	cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	{
	  /* Move this load back so it doesn't go past the end.
	     P8/P9 can do this efficiently.  */
	  unsigned int extra_bytes = load_mode_size - bytes;
	  cmp_bytes = bytes;
	  if (extra_bytes < offset)
	    {
	      offset -= extra_bytes;
	      cmp_bytes = load_mode_size;
	      bytes = cmp_bytes;
	    }
	}
      else
	/* P7 and earlier can't do the overlapping load trick fast,
	   so this forces a non-overlapping load and a shift to get
	   rid of the extra bytes.  */
	cmp_bytes = bytes;

      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      /* Force the addresses into registers before emitting the loads.  */
      if (!REG_P (XEXP (src1, 0)))
	{
	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
	  src1 = replace_equiv_address (src1, src1_reg);
	}
      set_mem_size (src1, load_mode_size);

      if (!REG_P (XEXP (src2, 0)))
	{
	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
	  src2 = replace_equiv_address (src2, src2_reg);
	}
      set_mem_size (src2, load_mode_size);

      do_load_for_compare (tmp_reg_src1, src1, load_mode);
      do_load_for_compare (tmp_reg_src2, src2, load_mode);

      if (cmp_bytes < load_mode_size)
	{
	  /* Shift unneeded bytes off.  */
	  rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
	  if (word_mode == DImode)
	    {
	      emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	  else
	    {
	      emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	}

      int remain = bytes - cmp_bytes;
      if (GET_MODE_SIZE (GET_MODE (target)) > GET_MODE_SIZE (load_mode))
	{
	  /* Target is larger than load size so we don't need to
	     reduce result size.  */

	  /* We previously did a block that needed 64->32 conversion but
	     the current block does not, so a label is needed to jump
	     to the end.  */
	  if (generate_6432_conversion && !final_label)
	    final_label = gen_label_rtx ();

	  if (remain > 0)
	    {
	      /* This is not the last block, branch to the end if the result
		 of this subtract is not zero.  */
	      if (!final_label)
		final_label = gen_label_rtx ();
	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
	      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
	      rtx cr = gen_reg_rtx (CCmode);
	      rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
	      emit_insn (gen_movsi (target,
				    gen_lowpart (SImode, tmp_reg_src2)));
	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
						 fin_ref, pc_rtx);
	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      JUMP_LABEL (j) = final_label;
	      LABEL_NUSES (final_label) += 1;
	    }
	  else
	    {
	      /* Last block: the plain subtract is the final result.  */
	      if (word_mode == DImode)
		{
		  emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
					 tmp_reg_src2));
		  emit_insn (gen_movsi (target,
					gen_lowpart (SImode, tmp_reg_src2)));
		}
	      else
		emit_insn (gen_subsi3 (target, tmp_reg_src1, tmp_reg_src2));

	      if (final_label)
		{
		  rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
		  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
		  JUMP_LABEL (j) = final_label;
		  LABEL_NUSES (final_label) += 1;
		  emit_barrier ();
		}
	    }
	}
      else
	{
	  /* Do we need a 64->32 conversion block?  We need the 64->32
	     conversion even if target size == load_mode size because
	     the subtract generates one extra bit.  */
	  generate_6432_conversion = true;

	  if (remain > 0)
	    {
	      if (!convert_label)
		convert_label = gen_label_rtx ();

	      /* Compare to zero and branch to convert_label if not zero.  */
	      rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
	      if (TARGET_P9_MISC)
		{
		  /* Generate a compare, and convert with a setb later.  */
		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
					     tmp_reg_src2);
		  emit_insn (gen_rtx_SET (cond, cmp));
		}
	      else
		/* Generate a subfc. and use the longer
		   sequence for conversion.  */
		if (TARGET_64BIT)
		  emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
						     tmp_reg_src1, cond));
		else
		  emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
						     tmp_reg_src1, cond));
	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
						 cvt_ref, pc_rtx);
	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	      JUMP_LABEL (j) = convert_label;
	      LABEL_NUSES (convert_label) += 1;
	    }
	  else
	    {
	      /* Just do the subtract/compare.  Since this is the last block
		 the convert code will be generated immediately following.  */
	      if (TARGET_P9_MISC)
		{
		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
					     tmp_reg_src2);
		  emit_insn (gen_rtx_SET (cond, cmp));
		}
	      else
		if (TARGET_64BIT)
		  emit_insn (gen_subfdi3_carry (tmp_reg_src2, tmp_reg_src2,
						tmp_reg_src1));
		else
		  emit_insn (gen_subfsi3_carry (tmp_reg_src2, tmp_reg_src2,
						tmp_reg_src1));
	    }
	}

      offset += cmp_bytes;
      bytes -= cmp_bytes;
    }

  if (generate_6432_conversion)
    {
      if (convert_label)
	emit_label (convert_label);

      /* We need to produce DI result from sub, then convert to target SI
	 while maintaining <0 / ==0 / >0 properties.  This sequence works:
	 subfc L,A,B
	 subfe H,H,H
	 popcntd L,L
	 rldimi L,H,6,0

	 This is an alternate one Segher cooked up if somebody
	 wants to expand this for something that doesn't have popcntd:
	 subfc L,a,b
	 subfe H,x,x
	 addic t,L,-1
	 subfe v,t,L
	 or z,v,H

	 And finally, p9 can just do this:
	 cmpld A,B
	 setb r */

      if (TARGET_P9_MISC)
	{
	  emit_insn (gen_setb_unsigned (target, cond));
	}
      else
	{
	  if (TARGET_64BIT)
	    {
	      rtx tmp_reg_ca = gen_reg_rtx (DImode);
	      emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
	      emit_insn (gen_popcntddi2 (tmp_reg_src2, tmp_reg_src2));
	      emit_insn (gen_iordi3 (tmp_reg_src2, tmp_reg_src2, tmp_reg_ca));
	      emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2)));
	    }
	  else
	    {
	      rtx tmp_reg_ca = gen_reg_rtx (SImode);
	      emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
	      emit_insn (gen_popcntdsi2 (tmp_reg_src2, tmp_reg_src2));
	      emit_insn (gen_iorsi3 (target, tmp_reg_src2, tmp_reg_ca));
	    }
	}
    }

  if (final_label)
    emit_label (final_label);

  /* The strategy loop above must have consumed exactly the whole length.  */
  gcc_assert (bytes == 0);
  return true;
}
1628 | ||
1629 | /* Generate alignment check and branch code to set up for | |
1630 | strncmp when we don't have DI alignment. | |
1631 | STRNCMP_LABEL is the label to branch if there is a page crossing. | |
1632 | SRC is the string pointer to be examined. | |
1633 | BYTES is the max number of bytes to compare. */ | |
1634 | static void | |
1635 | expand_strncmp_align_check (rtx strncmp_label, rtx src, HOST_WIDE_INT bytes) | |
1636 | { | |
1637 | rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label); | |
1638 | rtx src_check = copy_addr_to_reg (XEXP (src, 0)); | |
1639 | if (GET_MODE (src_check) == SImode) | |
1640 | emit_insn (gen_andsi3 (src_check, src_check, GEN_INT (0xfff))); | |
1641 | else | |
1642 | emit_insn (gen_anddi3 (src_check, src_check, GEN_INT (0xfff))); | |
1643 | rtx cond = gen_reg_rtx (CCmode); | |
1644 | emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_check, | |
1645 | GEN_INT (4096 - bytes))); | |
1646 | ||
0c791c59 | 1647 | rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx); |
8845cb37 AS |
1648 | |
1649 | rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, | |
0c791c59 | 1650 | lab_ref, pc_rtx); |
8845cb37 AS |
1651 | rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); |
1652 | JUMP_LABEL (j) = strncmp_label; | |
1653 | LABEL_NUSES (strncmp_label) += 1; | |
1654 | } | |
1655 | ||
/* Expand a string compare operation with length, and return
   true if successful.  Return false if we should let the
   compiler generate normal code, probably a strncmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   If NO_LENGTH is zero, then:
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment in bytes.
   If NO_LENGTH is nonzero, then:
   OPERANDS[3] is the alignment in bytes.  */
bool
expand_strn_compare (rtx operands[], int no_length)
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];
  rtx bytes_rtx, align_rtx;
  if (no_length)
    {
      bytes_rtx = NULL;
      align_rtx = operands[3];
    }
  else
    {
      bytes_rtx = operands[3];
      align_rtx = operands[4];
    }
  unsigned HOST_WIDE_INT cmp_bytes = 0;
  rtx src1 = orig_src1;
  rtx src2 = orig_src2;

  /* If we have a length, it must be constant.  This simplifies things
     a bit as we don't have to generate code to check if we've exceeded
     the length.  Later this could be expanded to handle this case.  */
  if (!no_length && !CONST_INT_P (bytes_rtx))
    return false;

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx);
  int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.  */
  if (targetm.slow_unaligned_access (word_mode, align1)
      || targetm.slow_unaligned_access (word_mode, align2))
    return false;

  gcc_assert (GET_MODE (target) == SImode);

  /* If we have an LE target without ldbrx and word_mode is DImode,
     then we must avoid using word_mode.  */
  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
		       && word_mode == DImode);

  unsigned int word_mode_size = GET_MODE_SIZE (word_mode);

  unsigned HOST_WIDE_INT offset = 0;
  unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available.  */
  unsigned HOST_WIDE_INT compare_length; /* How much to compare inline.  */
  if (no_length)
    /* Use this as a standin to determine the mode to use.  */
    bytes = rs6000_string_compare_inline_limit * word_mode_size;
  else
    bytes = UINTVAL (bytes_rtx);

  machine_mode load_mode =
    select_block_compare_mode (offset, bytes, base_align, word_mode_ok);
  unsigned int load_mode_size = GET_MODE_SIZE (load_mode);
  compare_length = rs6000_string_compare_inline_limit * load_mode_size;

  /* If we have equality at the end of the last compare and we have not
     found the end of the string, we need to call strcmp/strncmp to
     compare the remainder.  */
  bool equality_compare_rest = false;

  if (no_length)
    {
      bytes = compare_length;
      equality_compare_rest = true;
    }
  else
    {
      if (bytes <= compare_length)
	compare_length = bytes;
      else
	equality_compare_rest = true;
    }

  /* result_reg accumulates the word-mode result of each chunk compare;
     final_move_label is where the result is copied to TARGET, and
     final_label is the join point after any out-of-line library call.  */
  rtx result_reg = gen_reg_rtx (word_mode);
  rtx final_move_label = gen_label_rtx ();
  rtx final_label = gen_label_rtx ();
  rtx begin_compare_label = NULL;

  if (base_align < 8)
    {
      /* Generate code that checks distance to 4k boundary for this case.  */
      begin_compare_label = gen_label_rtx ();
      rtx strncmp_label = gen_label_rtx ();
      rtx jmp;

      /* Strncmp for power8 in glibc does this:
	 rldicl r8,r3,0,52
	 cmpldi cr7,r8,4096-16
	 bgt cr7,L(pagecross) */

      /* Make sure that the length we use for the alignment test and
	 the subsequent code generation are in agreement so we do not
	 go past the length we tested for a 4k boundary crossing.  */
      unsigned HOST_WIDE_INT align_test = compare_length;
      if (align_test < 8)
	{
	  /* Round up to the nearest power of two so the page-cross test
	     matches the (possibly overlapping) load size used below.  */
	  align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
	  base_align = align_test;
	}
      else
	{
	  align_test = ROUND_UP (align_test, 8);
	  base_align = 8;
	}

      if (align1 < 8)
	expand_strncmp_align_check (strncmp_label, src1, align_test);
      if (align2 < 8)
	expand_strncmp_align_check (strncmp_label, src2, align_test);

      /* Now generate the following sequence:
	 - branch to begin_compare
	 - strncmp_label
	 - call to strncmp
	 - branch to final_label
	 - begin_compare_label */

      rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
      JUMP_LABEL (jmp) = begin_compare_label;
      LABEL_NUSES (begin_compare_label) += 1;
      emit_barrier ();

      emit_label (strncmp_label);

      /* The library call needs the raw addresses in registers.  */
      if (!REG_P (XEXP (src1, 0)))
	{
	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
	  src1 = replace_equiv_address (src1, src1_reg);
	}

      if (!REG_P (XEXP (src2, 0)))
	{
	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
	  src2 = replace_equiv_address (src2, src2_reg);
	}

      if (no_length)
	{
	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
				   force_reg (Pmode, XEXP (src2, 0)), Pmode);
	}
      else
	{
	  /* -m32 -mpowerpc64 results in word_mode being DImode even
	     though otherwise it is 32-bit.  The length arg to strncmp
	     is a size_t which will be the same size as pointers.  */
	  rtx len_rtx;
	  if (TARGET_64BIT)
	    len_rtx = gen_reg_rtx (DImode);
	  else
	    len_rtx = gen_reg_rtx (SImode);

	  emit_move_insn (len_rtx, bytes_rtx);

	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
				   force_reg (Pmode, XEXP (src2, 0)), Pmode,
				   len_rtx, GET_MODE (len_rtx));
	}

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
      emit_label (begin_compare_label);
    }

  rtx cleanup_label = NULL;
  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);

  /* Generate sequence of ld/ldbrx, cmpb to compare out
     to the length specified.  */
  unsigned HOST_WIDE_INT bytes_to_compare = compare_length;
  while (bytes_to_compare > 0)
    {
      /* Compare sequence:
         check each 8B with: ld/ld cmpd bne
	 If equal, use rldicr/cmpb to check for zero byte.
         cleanup code at end:
         cmpb          get byte that differs
         cmpb          look for zero byte
         orc           combine
         cntlzd        get bit of first zero/diff byte
         subfic        convert for rldcl use
         rldcl rldcl   extract diff/zero byte
         subf          subtract for final result

         The last compare can branch around the cleanup code if the
         result is zero because the strings are exactly equal.  */
      unsigned int align = compute_current_alignment (base_align, offset);
      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	load_mode = select_block_compare_mode (offset, bytes_to_compare, align,
					       word_mode_ok);
      else
	load_mode = select_block_compare_mode (0, bytes_to_compare, align,
					       word_mode_ok);
      load_mode_size = GET_MODE_SIZE (load_mode);
      if (bytes_to_compare >= load_mode_size)
	cmp_bytes = load_mode_size;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
	{
	  /* Move this load back so it doesn't go past the end.
	     P8/P9 can do this efficiently.  */
	  unsigned int extra_bytes = load_mode_size - bytes_to_compare;
	  cmp_bytes = bytes_to_compare;
	  if (extra_bytes < offset)
	    {
	      offset -= extra_bytes;
	      cmp_bytes = load_mode_size;
	      bytes_to_compare = cmp_bytes;
	    }
	}
      else
	/* P7 and earlier can't do the overlapping load trick fast,
	   so this forces a non-overlapping load and a shift to get
	   rid of the extra bytes.  */
	cmp_bytes = bytes_to_compare;

      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      if (!REG_P (XEXP (src1, 0)))
	{
	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
	  src1 = replace_equiv_address (src1, src1_reg);
	}
      set_mem_size (src1, load_mode_size);

      if (!REG_P (XEXP (src2, 0)))
	{
	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
	  src2 = replace_equiv_address (src2, src2_reg);
	}
      set_mem_size (src2, load_mode_size);

      do_load_for_compare (tmp_reg_src1, src1, load_mode);
      do_load_for_compare (tmp_reg_src2, src2, load_mode);

      /* We must always left-align the data we read, and
	 clear any bytes to the right that are beyond the string.
	 Otherwise the cmpb sequence won't produce the correct
	 results.  The beginning of the compare will be done
	 with word_mode so will not have any extra shifts or
	 clear rights.  */

      if (load_mode_size < word_mode_size)
	{
	  /* Rotate left first.  */
	  rtx sh = GEN_INT (BITS_PER_UNIT * (word_mode_size - load_mode_size));
	  if (word_mode == DImode)
	    {
	      emit_insn (gen_rotldi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_rotldi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	  else
	    {
	      emit_insn (gen_rotlsi3 (tmp_reg_src1, tmp_reg_src1, sh));
	      emit_insn (gen_rotlsi3 (tmp_reg_src2, tmp_reg_src2, sh));
	    }
	}

      if (cmp_bytes < word_mode_size)
	{
	  /* Now clear right.  This plus the rotate can be
	     turned into a rldicr instruction.  */
	  HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
	  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
	  if (word_mode == DImode)
	    {
	      emit_insn (gen_anddi3_mask (tmp_reg_src1, tmp_reg_src1, mask));
	      emit_insn (gen_anddi3_mask (tmp_reg_src2, tmp_reg_src2, mask));
	    }
	  else
	    {
	      emit_insn (gen_andsi3_mask (tmp_reg_src1, tmp_reg_src1, mask));
	      emit_insn (gen_andsi3_mask (tmp_reg_src2, tmp_reg_src2, mask));
	    }
	}

      /* Cases to handle.  A and B are chunks of the two strings.
	 1: Not end of comparison:
	 A != B: branch to cleanup code to compute result.
	 A == B: check for 0 byte, next block if not found.
	 2: End of the inline comparison:
	 A != B: branch to cleanup code to compute result.
	 A == B: check for 0 byte, call strcmp/strncmp
	 3: compared requested N bytes:
	 A == B: branch to result 0.
	 A != B: cleanup code to compute result.  */

      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;

      rtx dst_label;
      if (remain > 0 || equality_compare_rest)
	{
	  /* Branch to cleanup code, otherwise fall through to do
	     more compares.  */
	  if (!cleanup_label)
	    cleanup_label = gen_label_rtx ();
	  dst_label = cleanup_label;
	}
      else
	/* Branch to end and produce result of 0.  */
	dst_label = final_move_label;

      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
      rtx cond = gen_reg_rtx (CCmode);

      /* Always produce the 0 result, it is needed if
	 cmpb finds a 0 byte in this chunk.  */
      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
      rs6000_emit_dot_insn (result_reg, tmp, 1, cond);

      rtx cmp_rtx;
      if (remain == 0 && !equality_compare_rest)
	cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
      else
	cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);

      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
					 lab_ref, pc_rtx);
      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
      JUMP_LABEL (j) = dst_label;
      LABEL_NUSES (dst_label) += 1;

      if (remain > 0 || equality_compare_rest)
	{
	  /* Generate a cmpb to test for a 0 byte and branch
	     to final result if found.  */
	  rtx cmpb_zero = gen_reg_rtx (word_mode);
	  rtx lab_ref_fin = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
	  rtx condz = gen_reg_rtx (CCmode);
	  rtx zero_reg = gen_reg_rtx (word_mode);
	  if (word_mode == SImode)
	    {
	      emit_insn (gen_movsi (zero_reg, GEN_INT (0)));
	      emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg));
	      if (cmp_bytes < word_mode_size)
		{
		  /* Don't want to look at zero bytes past end.  */
		  HOST_WIDE_INT mb =
		    BITS_PER_UNIT * (word_mode_size - cmp_bytes);
		  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
		  emit_insn (gen_andsi3_mask (cmpb_zero, cmpb_zero, mask));
		}
	    }
	  else
	    {
	      emit_insn (gen_movdi (zero_reg, GEN_INT (0)));
	      emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg));
	      if (cmp_bytes < word_mode_size)
		{
		  /* Don't want to look at zero bytes past end.  */
		  HOST_WIDE_INT mb =
		    BITS_PER_UNIT * (word_mode_size - cmp_bytes);
		  rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
		  emit_insn (gen_anddi3_mask (cmpb_zero, cmpb_zero, mask));
		}
	    }

	  emit_move_insn (condz, gen_rtx_COMPARE (CCmode, cmpb_zero, zero_reg));
	  rtx cmpnz_rtx = gen_rtx_NE (VOIDmode, condz, const0_rtx);
	  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmpnz_rtx,
					     lab_ref_fin, pc_rtx);
	  rtx j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
	  JUMP_LABEL (j2) = final_move_label;
	  LABEL_NUSES (final_move_label) += 1;

	}

      offset += cmp_bytes;
      bytes_to_compare -= cmp_bytes;
    }

  if (equality_compare_rest)
    {
      /* Update pointers past what has been compared already.  */
      src1 = adjust_address (orig_src1, load_mode, offset);
      src2 = adjust_address (orig_src2, load_mode, offset);

      if (!REG_P (XEXP (src1, 0)))
	{
	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
	  src1 = replace_equiv_address (src1, src1_reg);
	}
      set_mem_size (src1, load_mode_size);

      if (!REG_P (XEXP (src2, 0)))
	{
	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
	  src2 = replace_equiv_address (src2, src2_reg);
	}
      set_mem_size (src2, load_mode_size);

      /* Construct call to strcmp/strncmp to compare the rest of the string.  */
      if (no_length)
	{
	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
				   force_reg (Pmode, XEXP (src2, 0)), Pmode);
	}
      else
	{
	  rtx len_rtx;
	  if (TARGET_64BIT)
	    len_rtx = gen_reg_rtx (DImode);
	  else
	    len_rtx = gen_reg_rtx (SImode);

	  emit_move_insn (len_rtx, GEN_INT (bytes - compare_length));
	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
				   target, LCT_NORMAL, GET_MODE (target),
				   force_reg (Pmode, XEXP (src1, 0)), Pmode,
				   force_reg (Pmode, XEXP (src2, 0)), Pmode,
				   len_rtx, GET_MODE (len_rtx));
	}

      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
      JUMP_LABEL (jmp) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }

  if (cleanup_label)
    emit_label (cleanup_label);

  /* Generate the final sequence that identifies the differing
     byte and generates the final result, taking into account
     zero bytes:

     cmpb              cmpb_result1, src1, src2
     cmpb              cmpb_result2, src1, zero
     orc               cmpb_result1, cmp_result1, cmpb_result2
     cntlzd            get bit of first zero/diff byte
     addi              convert for rldcl use
     rldcl rldcl       extract diff/zero byte
     subf              subtract for final result
  */

  rtx cmpb_diff = gen_reg_rtx (word_mode);
  rtx cmpb_zero = gen_reg_rtx (word_mode);
  rtx rot_amt = gen_reg_rtx (word_mode);
  rtx zero_reg = gen_reg_rtx (word_mode);

  rtx rot1_1 = gen_reg_rtx (word_mode);
  rtx rot1_2 = gen_reg_rtx (word_mode);
  rtx rot2_1 = gen_reg_rtx (word_mode);
  rtx rot2_2 = gen_reg_rtx (word_mode);

  if (word_mode == SImode)
    {
      emit_insn (gen_cmpbsi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2));
      emit_insn (gen_movsi (zero_reg, GEN_INT (0)));
      emit_insn (gen_cmpbsi3 (cmpb_zero, tmp_reg_src1, zero_reg));
      emit_insn (gen_one_cmplsi2 (cmpb_diff,cmpb_diff));
      emit_insn (gen_iorsi3 (cmpb_diff, cmpb_diff, cmpb_zero));
      emit_insn (gen_clzsi2 (rot_amt, cmpb_diff));
      /* Rotate amount is clz + 8 so the differing/zero byte lands in the
	 low-order byte position; masking with 0xff then isolates it.  */
      emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
      emit_insn (gen_rotlsi3 (rot1_1, tmp_reg_src1,
			      gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotlsi3 (rot2_1, tmp_reg_src2,
			      gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subsi3 (result_reg, rot1_2, rot2_2));
    }
  else
    {
      emit_insn (gen_cmpbdi3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2));
      emit_insn (gen_movdi (zero_reg, GEN_INT (0)));
      emit_insn (gen_cmpbdi3 (cmpb_zero, tmp_reg_src1, zero_reg));
      emit_insn (gen_one_cmpldi2 (cmpb_diff,cmpb_diff));
      emit_insn (gen_iordi3 (cmpb_diff, cmpb_diff, cmpb_zero));
      emit_insn (gen_clzdi2 (rot_amt, cmpb_diff));
      emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)))
;
      emit_insn (gen_rotldi3 (rot1_1, tmp_reg_src1,
			      gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
      emit_insn (gen_rotldi3 (rot2_1, tmp_reg_src2,
			      gen_lowpart (SImode, rot_amt)));
      emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
      emit_insn (gen_subdi3 (result_reg, rot1_2, rot2_2));
    }

  emit_label (final_move_label);
  emit_insn (gen_movsi (target,
			gen_lowpart (SImode, result_reg)));
  emit_label (final_label);
  return true;
}
2178 | ||
2179 | /* Expand a block move operation, and return 1 if successful. Return 0 | |
2180 | if we should let the compiler generate normal code. | |
2181 | ||
2182 | operands[0] is the destination | |
2183 | operands[1] is the source | |
2184 | operands[2] is the length | |
2185 | operands[3] is the alignment */ | |
2186 | ||
2187 | #define MAX_MOVE_REG 4 | |
2188 | ||
2189 | int | |
2190 | expand_block_move (rtx operands[]) | |
2191 | { | |
2192 | rtx orig_dest = operands[0]; | |
2193 | rtx orig_src = operands[1]; | |
2194 | rtx bytes_rtx = operands[2]; | |
2195 | rtx align_rtx = operands[3]; | |
2196 | int constp = (GET_CODE (bytes_rtx) == CONST_INT); | |
2197 | int align; | |
2198 | int bytes; | |
2199 | int offset; | |
2200 | int move_bytes; | |
2201 | rtx stores[MAX_MOVE_REG]; | |
2202 | int num_reg = 0; | |
2203 | ||
2204 | /* If this is not a fixed size move, just call memcpy */ | |
2205 | if (! constp) | |
2206 | return 0; | |
2207 | ||
2208 | /* This must be a fixed size alignment */ | |
2209 | gcc_assert (GET_CODE (align_rtx) == CONST_INT); | |
2210 | align = INTVAL (align_rtx) * BITS_PER_UNIT; | |
2211 | ||
2212 | /* Anything to move? */ | |
2213 | bytes = INTVAL (bytes_rtx); | |
2214 | if (bytes <= 0) | |
2215 | return 1; | |
2216 | ||
2217 | if (bytes > rs6000_block_move_inline_limit) | |
2218 | return 0; | |
2219 | ||
2220 | for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes) | |
2221 | { | |
2222 | union { | |
2223 | rtx (*movmemsi) (rtx, rtx, rtx, rtx); | |
2224 | rtx (*mov) (rtx, rtx); | |
2225 | } gen_func; | |
2226 | machine_mode mode = BLKmode; | |
2227 | rtx src, dest; | |
2228 | ||
2229 | /* Altivec first, since it will be faster than a string move | |
2230 | when it applies, and usually not significantly larger. */ | |
3b0cb1a5 | 2231 | if (TARGET_ALTIVEC && bytes >= 16 && (TARGET_EFFICIENT_UNALIGNED_VSX || align >= 128)) |
8845cb37 AS |
2232 | { |
2233 | move_bytes = 16; | |
2234 | mode = V4SImode; | |
2235 | gen_func.mov = gen_movv4si; | |
2236 | } | |
8845cb37 AS |
2237 | else if (bytes >= 8 && TARGET_POWERPC64 |
2238 | && (align >= 64 || !STRICT_ALIGNMENT)) | |
2239 | { | |
2240 | move_bytes = 8; | |
2241 | mode = DImode; | |
2242 | gen_func.mov = gen_movdi; | |
2243 | if (offset == 0 && align < 64) | |
2244 | { | |
2245 | rtx addr; | |
2246 | ||
2247 | /* If the address form is reg+offset with offset not a | |
2248 | multiple of four, reload into reg indirect form here | |
2249 | rather than waiting for reload. This way we get one | |
2250 | reload, not one per load and/or store. */ | |
2251 | addr = XEXP (orig_dest, 0); | |
2252 | if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM) | |
2253 | && GET_CODE (XEXP (addr, 1)) == CONST_INT | |
2254 | && (INTVAL (XEXP (addr, 1)) & 3) != 0) | |
2255 | { | |
2256 | addr = copy_addr_to_reg (addr); | |
2257 | orig_dest = replace_equiv_address (orig_dest, addr); | |
2258 | } | |
2259 | addr = XEXP (orig_src, 0); | |
2260 | if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM) | |
2261 | && GET_CODE (XEXP (addr, 1)) == CONST_INT | |
2262 | && (INTVAL (XEXP (addr, 1)) & 3) != 0) | |
2263 | { | |
2264 | addr = copy_addr_to_reg (addr); | |
2265 | orig_src = replace_equiv_address (orig_src, addr); | |
2266 | } | |
2267 | } | |
2268 | } | |
8845cb37 AS |
2269 | else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT)) |
2270 | { /* move 4 bytes */ | |
2271 | move_bytes = 4; | |
2272 | mode = SImode; | |
2273 | gen_func.mov = gen_movsi; | |
2274 | } | |
2275 | else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT)) | |
2276 | { /* move 2 bytes */ | |
2277 | move_bytes = 2; | |
2278 | mode = HImode; | |
2279 | gen_func.mov = gen_movhi; | |
2280 | } | |
8845cb37 AS |
2281 | else /* move 1 byte at a time */ |
2282 | { | |
2283 | move_bytes = 1; | |
2284 | mode = QImode; | |
2285 | gen_func.mov = gen_movqi; | |
2286 | } | |
2287 | ||
2288 | src = adjust_address (orig_src, mode, offset); | |
2289 | dest = adjust_address (orig_dest, mode, offset); | |
2290 | ||
2291 | if (mode != BLKmode) | |
2292 | { | |
2293 | rtx tmp_reg = gen_reg_rtx (mode); | |
2294 | ||
2295 | emit_insn ((*gen_func.mov) (tmp_reg, src)); | |
2296 | stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg); | |
2297 | } | |
2298 | ||
2299 | if (mode == BLKmode || num_reg >= MAX_MOVE_REG || bytes == move_bytes) | |
2300 | { | |
2301 | int i; | |
2302 | for (i = 0; i < num_reg; i++) | |
2303 | emit_insn (stores[i]); | |
2304 | num_reg = 0; | |
2305 | } | |
2306 | ||
2307 | if (mode == BLKmode) | |
2308 | { | |
2309 | /* Move the address into scratch registers. The movmemsi | |
2310 | patterns require zero offset. */ | |
2311 | if (!REG_P (XEXP (src, 0))) | |
2312 | { | |
2313 | rtx src_reg = copy_addr_to_reg (XEXP (src, 0)); | |
2314 | src = replace_equiv_address (src, src_reg); | |
2315 | } | |
2316 | set_mem_size (src, move_bytes); | |
2317 | ||
2318 | if (!REG_P (XEXP (dest, 0))) | |
2319 | { | |
2320 | rtx dest_reg = copy_addr_to_reg (XEXP (dest, 0)); | |
2321 | dest = replace_equiv_address (dest, dest_reg); | |
2322 | } | |
2323 | set_mem_size (dest, move_bytes); | |
2324 | ||
2325 | emit_insn ((*gen_func.movmemsi) (dest, src, | |
2326 | GEN_INT (move_bytes & 31), | |
2327 | align_rtx)); | |
2328 | } | |
2329 | } | |
2330 | ||
2331 | return 1; | |
2332 | } |