1 /* Subroutines used to expand string operations for RISC-V.
2 Copyright (C) 2023-2024 Free Software Foundation, Inc.
4 This file is part of GCC.
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published
8 by the Free Software Foundation; either version 3, or (at your
9 option) any later version.
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
13 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
14 License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
20 #define IN_TARGET_CODE 1
24 #include "coretypes.h"
31 #include "print-tree.h"
39 #include "riscv-protos.h"
41 #include "tm-constrs.h"
43 /* Emit proper instruction depending on mode of dest. */
45 #define GEN_EMIT_HELPER2(name) \
47 do_## name ## 2(rtx dest, rtx src) \
50 if (GET_MODE (dest) == DImode) \
51 insn = emit_insn (gen_ ## name ## di2 (dest, src)); \
53 insn = emit_insn (gen_ ## name ## si2 (dest, src)); \
57 /* Emit proper instruction depending on mode of dest. */
59 #define GEN_EMIT_HELPER3(name) \
61 do_## name ## 3(rtx dest, rtx src1, rtx src2) \
64 if (GET_MODE (dest) == DImode) \
65 insn = emit_insn (gen_ ## name ## di3 (dest, src1, src2)); \
67 insn = emit_insn (gen_ ## name ## si3 (dest, src1, src2)); \
71 GEN_EMIT_HELPER3(add
) /* do_add3 */
72 GEN_EMIT_HELPER3(and) /* do_and3 */
73 GEN_EMIT_HELPER3(ashl
) /* do_ashl3 */
74 GEN_EMIT_HELPER2(bswap
) /* do_bswap2 */
75 GEN_EMIT_HELPER2(clz
) /* do_clz2 */
76 GEN_EMIT_HELPER2(ctz
) /* do_ctz2 */
77 GEN_EMIT_HELPER3(ior
) /* do_ior3 */
78 GEN_EMIT_HELPER3(ior_not
) /* do_ior_not3 */
79 GEN_EMIT_HELPER3(lshr
) /* do_lshr3 */
80 GEN_EMIT_HELPER2(neg
) /* do_neg2 */
81 GEN_EMIT_HELPER2(orcb
) /* do_orcb2 */
82 GEN_EMIT_HELPER2(one_cmpl
) /* do_one_cmpl2 */
83 GEN_EMIT_HELPER3(rotr
) /* do_rotr3 */
84 GEN_EMIT_HELPER3(sub
) /* do_sub3 */
85 GEN_EMIT_HELPER2(th_rev
) /* do_th_rev2 */
86 GEN_EMIT_HELPER2(th_tstnbz
) /* do_th_tstnbz2 */
87 GEN_EMIT_HELPER3(xor) /* do_xor3 */
88 GEN_EMIT_HELPER2(zero_extendqi
) /* do_zero_extendqi2 */
90 #undef GEN_EMIT_HELPER2
91 #undef GEN_EMIT_HELPER3
93 /* Helper function to load a byte or a Pmode register.
95 MODE is the mode to use for the load (QImode or Pmode).
96 DEST is the destination register for the data.
97 ADDR_REG is the register that holds the address.
98 ADDR is the address expression to load from.
100 This function returns an rtx containing the register,
101 where the ADDR is stored. */
104 do_load_from_addr (machine_mode mode
, rtx dest
, rtx addr_reg
, rtx addr
)
106 rtx mem
= gen_rtx_MEM (mode
, addr_reg
);
107 MEM_COPY_ATTRIBUTES (mem
, addr
);
108 set_mem_size (mem
, GET_MODE_SIZE (mode
));
111 do_zero_extendqi2 (dest
, mem
);
112 else if (mode
== Xmode
)
113 emit_move_insn (dest
, mem
);
120 /* Generate a sequence to compare single characters in data1 and data2.
122 RESULT is the register where the return value of str(n)cmp will be stored.
123 DATA1 is a register which contains character1.
124 DATA2 is a register which contains character2.
125 FINAL_LABEL is the location after the calculation of the return value. */
128 emit_strcmp_scalar_compare_byte (rtx result
, rtx data1
, rtx data2
,
131 rtx tmp
= gen_reg_rtx (Xmode
);
132 do_sub3 (tmp
, data1
, data2
);
133 emit_insn (gen_movsi (result
, gen_lowpart (SImode
, tmp
)));
134 emit_jump_insn (gen_jump (final_label
));
135 emit_barrier (); /* No fall-through. */
138 /* Generate a sequence to compare two strings in data1 and data2.
140 DATA1 is a register which contains string1.
141 DATA2 is a register which contains string2.
142 ORC1 is a register where orc.b(data1) will be stored.
143 CMP_BYTES is the length of the strings.
144 END_LABEL is the location of the code that calculates the return value. */
147 emit_strcmp_scalar_compare_subword (rtx data1
, rtx data2
, rtx orc1
,
148 unsigned HOST_WIDE_INT cmp_bytes
,
151 /* Set a NUL-byte after the relevant data (behind the string). */
152 long long im
= -256ll;
153 rtx imask
= gen_rtx_CONST_INT (Xmode
, im
);
154 rtx m_reg
= gen_reg_rtx (Xmode
);
155 emit_insn (gen_rtx_SET (m_reg
, imask
));
156 do_rotr3 (m_reg
, m_reg
, GEN_INT (64 - cmp_bytes
* BITS_PER_UNIT
));
157 do_and3 (data1
, m_reg
, data1
);
158 do_and3 (data2
, m_reg
, data2
);
160 do_orcb2 (orc1
, data1
);
162 do_th_tstnbz2 (orc1
, data1
);
163 emit_jump_insn (gen_jump (end_label
));
164 emit_barrier (); /* No fall-through. */
167 /* Generate a sequence to compare two strings in data1 and data2.
169 DATA1 is a register which contains string1.
170 DATA2 is a register which contains string2.
171 ORC1 is a register where orc.b(data1) will be stored.
172 TESTVAL is the value to test ORC1 against.
173 END_LABEL is the location of the code that calculates the return value.
174 NONUL_END_LABEL is the location of the code that calculates the return value
175 in case the first string does not contain a NUL-byte. */
178 emit_strcmp_scalar_compare_word (rtx data1
, rtx data2
, rtx orc1
, rtx testval
,
179 rtx end_label
, rtx nonul_end_label
)
181 /* Check if data1 contains a NUL character. */
183 do_orcb2 (orc1
, data1
);
185 do_th_tstnbz2 (orc1
, data1
);
186 rtx cond1
= gen_rtx_NE (VOIDmode
, orc1
, testval
);
187 emit_unlikely_jump_insn (gen_cbranch4 (Pmode
, cond1
, orc1
, testval
,
189 /* Break out if u1 != u2 */
190 rtx cond2
= gen_rtx_NE (VOIDmode
, data1
, data2
);
191 emit_unlikely_jump_insn (gen_cbranch4 (Pmode
, cond2
, data1
,
192 data2
, nonul_end_label
));
193 /* Fall-through on equality. */
196 /* Generate the sequence of compares for strcmp/strncmp using zbb instructions.
198 RESULT is the register where the return value of str(n)cmp will be stored.
199 The strings are referenced by SRC1 and SRC2.
200 The number of bytes to compare is defined by NBYTES.
201 DATA1 is a register where string1 will be stored.
202 DATA2 is a register where string2 will be stored.
203 ORC1 is a register where orc.b(data1) will be stored.
204 END_LABEL is the location of the code that calculates the return value.
205 NONUL_END_LABEL is the location of the code that calculates the return value
206 in case the first string does not contain a NUL-byte.
207 FINAL_LABEL is the location of the code that comes after the calculation
208 of the return value. */
211 emit_strcmp_scalar_load_and_compare (rtx result
, rtx src1
, rtx src2
,
212 unsigned HOST_WIDE_INT nbytes
,
213 rtx data1
, rtx data2
, rtx orc1
,
214 rtx end_label
, rtx nonul_end_label
,
217 const unsigned HOST_WIDE_INT xlen
= GET_MODE_SIZE (Xmode
);
218 rtx src1_addr
= force_reg (Pmode
, XEXP (src1
, 0));
219 rtx src2_addr
= force_reg (Pmode
, XEXP (src2
, 0));
220 unsigned HOST_WIDE_INT offset
= 0;
222 rtx testval
= gen_reg_rtx (Xmode
);
224 emit_insn (gen_rtx_SET (testval
, constm1_rtx
));
226 emit_insn (gen_rtx_SET (testval
, const0_rtx
));
230 unsigned HOST_WIDE_INT cmp_bytes
= xlen
< nbytes
? xlen
: nbytes
;
231 machine_mode load_mode
;
237 rtx addr1
= gen_rtx_PLUS (Pmode
, src1_addr
, GEN_INT (offset
));
238 do_load_from_addr (load_mode
, data1
, addr1
, src1
);
239 rtx addr2
= gen_rtx_PLUS (Pmode
, src2_addr
, GEN_INT (offset
));
240 do_load_from_addr (load_mode
, data2
, addr2
, src2
);
244 emit_strcmp_scalar_compare_byte (result
, data1
, data2
, final_label
);
247 else if (cmp_bytes
< xlen
)
249 emit_strcmp_scalar_compare_subword (data1
, data2
, orc1
,
250 cmp_bytes
, end_label
);
254 emit_strcmp_scalar_compare_word (data1
, data2
, orc1
, testval
,
255 end_label
, nonul_end_label
);
262 /* Fixup pointers and generate a call to strcmp.
264 RESULT is the register where the return value of str(n)cmp will be stored.
265 The strings are referenced by SRC1 and SRC2.
266 The number of already compared bytes is defined by NBYTES. */
269 emit_strcmp_scalar_call_to_libc (rtx result
, rtx src1
, rtx src2
,
270 unsigned HOST_WIDE_INT nbytes
)
272 /* Update pointers past what has been compared already. */
273 rtx src1_addr
= force_reg (Pmode
, XEXP (src1
, 0));
274 rtx src2_addr
= force_reg (Pmode
, XEXP (src2
, 0));
275 rtx src1_new
= force_reg (Pmode
,
276 gen_rtx_PLUS (Pmode
, src1_addr
, GEN_INT (nbytes
)));
277 rtx src2_new
= force_reg (Pmode
,
278 gen_rtx_PLUS (Pmode
, src2_addr
, GEN_INT (nbytes
)));
280 /* Construct call to strcmp to compare the rest of the string. */
281 tree fun
= builtin_decl_explicit (BUILT_IN_STRCMP
);
282 emit_library_call_value (XEXP (DECL_RTL (fun
), 0),
283 result
, LCT_NORMAL
, GET_MODE (result
),
284 src1_new
, Pmode
, src2_new
, Pmode
);
287 /* Fast strcmp-result calculation if no NUL-byte in string1.
289 RESULT is the register where the return value of str(n)cmp will be stored.
290 The mismatching strings are stored in DATA1 and DATA2. */
293 emit_strcmp_scalar_result_calculation_nonul (rtx result
, rtx data1
, rtx data2
)
295 /* Words don't match, and no NUL byte in one word.
296 Get bytes in big-endian order and compare as words. */
297 do_bswap2 (data1
, data1
);
298 do_bswap2 (data2
, data2
);
299 /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence. */
300 rtx tmp
= gen_reg_rtx (Xmode
);
301 emit_insn (gen_slt_3 (LTU
, Xmode
, Xmode
, tmp
, data1
, data2
));
303 do_ior3 (tmp
, tmp
, const1_rtx
);
304 emit_insn (gen_movsi (result
, gen_lowpart (SImode
, tmp
)));
307 /* strcmp-result calculation.
309 RESULT is the register where the return value of str(n)cmp will be stored.
310 The strings are stored in DATA1 and DATA2.
311 ORC1 contains orc.b(DATA1). */
314 emit_strcmp_scalar_result_calculation (rtx result
, rtx data1
, rtx data2
,
317 const unsigned HOST_WIDE_INT xlen
= GET_MODE_SIZE (Xmode
);
319 /* Convert non-equal bytes into non-NUL bytes. */
320 rtx diff
= gen_reg_rtx (Xmode
);
321 do_xor3 (diff
, data1
, data2
);
322 rtx shift
= gen_reg_rtx (Xmode
);
326 /* Convert non-equal or NUL-bytes into non-NUL bytes. */
327 rtx syndrome
= gen_reg_rtx (Xmode
);
328 do_orcb2 (diff
, diff
);
329 do_ior_not3 (syndrome
, orc1
, diff
);
330 /* Count the number of equal bits from the beginning of the word. */
331 do_ctz2 (shift
, syndrome
);
335 /* Convert non-equal or NUL-bytes into non-NUL bytes. */
336 rtx syndrome
= gen_reg_rtx (Xmode
);
337 do_th_tstnbz2 (diff
, diff
);
338 do_one_cmpl2 (diff
, diff
);
339 do_ior3 (syndrome
, orc1
, diff
);
340 /* Count the number of equal bits from the beginning of the word. */
341 do_th_rev2 (syndrome
, syndrome
);
342 do_clz2 (shift
, syndrome
);
345 do_bswap2 (data1
, data1
);
346 do_bswap2 (data2
, data2
);
348 /* The most-significant-non-zero bit of the syndrome marks either the
349 first bit that is different, or the top bit of the first zero byte.
350 Shifting left now will bring the critical information into the
352 do_ashl3 (data1
, data1
, gen_lowpart (QImode
, shift
));
353 do_ashl3 (data2
, data2
, gen_lowpart (QImode
, shift
));
355 /* But we need to zero-extend (char is unsigned) the value and then
356 perform a signed 32-bit subtraction. */
357 unsigned int shiftr
= (xlen
- 1) * BITS_PER_UNIT
;
358 do_lshr3 (data1
, data1
, GEN_INT (shiftr
));
359 do_lshr3 (data2
, data2
, GEN_INT (shiftr
));
360 rtx tmp
= gen_reg_rtx (Xmode
);
361 do_sub3 (tmp
, data1
, data2
);
362 emit_insn (gen_movsi (result
, gen_lowpart (SImode
, tmp
)));
365 /* Expand str(n)cmp using Zbb/TheadBb instructions.
367 The result will be stored in RESULT.
368 The strings are referenced by SRC1 and SRC2.
369 The number of bytes to compare is defined by NBYTES.
370 The alignment is defined by ALIGNMENT.
371 If NCOMPARE is false then libc's strcmp() will be called if comparing
372 NBYTES of both strings did not find differences or NUL-bytes.
374 Return true if expansion was successful, or false otherwise. */
377 riscv_expand_strcmp_scalar (rtx result
, rtx src1
, rtx src2
,
378 unsigned HOST_WIDE_INT nbytes
,
379 unsigned HOST_WIDE_INT alignment
,
382 const unsigned HOST_WIDE_INT xlen
= GET_MODE_SIZE (Xmode
);
384 gcc_assert (TARGET_ZBB
|| TARGET_XTHEADBB
);
385 gcc_assert (nbytes
> 0);
386 gcc_assert ((int)nbytes
<= riscv_strcmp_inline_limit
);
387 gcc_assert (ncompare
|| (nbytes
& (xlen
- 1)) == 0);
389 /* Limit to 12-bits (maximum load-offset). */
390 if (nbytes
> IMM_REACH
)
393 /* We don't support big endian. */
394 if (BYTES_BIG_ENDIAN
)
397 /* We need xlen-aligned strings. */
398 if (alignment
< xlen
)
401 /* Overall structure of emitted code:
403 - Load data1 and data2
404 - Set orc1 := orc.b (data1) (or th.tstnbz)
405 - Compare strings and either:
406 - Fall-through on equality
407 - Jump to nonul_end_label (data1 != data2) or end_label
408 - Calculate result value and jump to final_label
410 Call-to-libc or set result to 0 (depending on ncompare)
412 nonul_end_label: // words don't match, and no null byte in first word.
413 Calculate result value with the use of data1, data2 and orc1
416 Calculate result value with the use of data1, data2 and orc1
421 rtx data1
= gen_reg_rtx (Xmode
);
422 rtx data2
= gen_reg_rtx (Xmode
);
423 rtx orc1
= gen_reg_rtx (Xmode
);
424 rtx nonul_end_label
= gen_label_rtx ();
425 rtx end_label
= gen_label_rtx ();
426 rtx final_label
= gen_label_rtx ();
428 /* Generate a sequence of zbb instructions to compare out
429 to the length specified. */
430 emit_strcmp_scalar_load_and_compare (result
, src1
, src2
, nbytes
,
432 end_label
, nonul_end_label
, final_label
);
434 /* All compared and everything was equal. */
437 emit_insn (gen_rtx_SET (result
, gen_rtx_CONST_INT (SImode
, 0)));
438 emit_jump_insn (gen_jump (final_label
));
439 emit_barrier (); /* No fall-through. */
443 emit_strcmp_scalar_call_to_libc (result
, src1
, src2
, nbytes
);
444 emit_jump_insn (gen_jump (final_label
));
445 emit_barrier (); /* No fall-through. */
449 emit_label (nonul_end_label
);
450 emit_strcmp_scalar_result_calculation_nonul (result
, data1
, data2
);
451 emit_jump_insn (gen_jump (final_label
));
452 emit_barrier (); /* No fall-through. */
454 emit_label (end_label
);
455 emit_strcmp_scalar_result_calculation (result
, data1
, data2
, orc1
);
456 emit_jump_insn (gen_jump (final_label
));
457 emit_barrier (); /* No fall-through. */
459 emit_label (final_label
);
463 /* Expand a string compare operation.
465 The result will be stored in RESULT.
466 The strings are referenced by SRC1 and SRC2.
467 The argument BYTES_RTX either holds the number of characters to
468 compare, or is NULL_RTX. The argument ALIGN_RTX holds the alignment.
470 Return true if expansion was successful, or false otherwise. */
473 riscv_expand_strcmp (rtx result
, rtx src1
, rtx src2
,
474 rtx bytes_rtx
, rtx align_rtx
)
476 unsigned HOST_WIDE_INT compare_max
;
477 unsigned HOST_WIDE_INT nbytes
;
478 unsigned HOST_WIDE_INT alignment
;
479 bool ncompare
= bytes_rtx
!= NULL_RTX
;
480 const unsigned HOST_WIDE_INT xlen
= GET_MODE_SIZE (Xmode
);
482 if (riscv_strcmp_inline_limit
== 0)
485 /* Round down the comparison limit to a multiple of xlen. */
486 compare_max
= riscv_strcmp_inline_limit
& ~(xlen
- 1);
488 /* Decide how many bytes to compare inline. */
489 if (bytes_rtx
== NULL_RTX
)
491 nbytes
= compare_max
;
495 /* If we have a length, it must be constant. */
496 if (!CONST_INT_P (bytes_rtx
))
498 nbytes
= UINTVAL (bytes_rtx
);
500 /* We don't emit parts of a strncmp() call. */
501 if (nbytes
> compare_max
)
507 - nbytes <= riscv_strcmp_inline_limit
508 - nbytes is a multiple of xlen if !ncompare */
510 if (!CONST_INT_P (align_rtx
))
512 alignment
= UINTVAL (align_rtx
);
514 if (TARGET_VECTOR
&& stringop_strategy
& STRATEGY_VECTOR
)
516 bool ok
= riscv_vector::expand_strcmp (result
, src1
, src2
,
517 bytes_rtx
, alignment
,
523 if ((TARGET_ZBB
|| TARGET_XTHEADBB
) && stringop_strategy
& STRATEGY_SCALAR
)
524 return riscv_expand_strcmp_scalar (result
, src1
, src2
, nbytes
, alignment
,
530 /* If the provided string is aligned, then read XLEN bytes
531 in a loop and use orc.b to find NUL-bytes. */
534 riscv_expand_strlen_scalar (rtx result
, rtx src
, rtx align
)
536 rtx testval
, addr
, addr_plus_regsz
, word
, zeros
;
537 rtx loop_label
, cond
;
539 gcc_assert (TARGET_ZBB
|| TARGET_XTHEADBB
);
541 /* The alignment needs to be known and big enough. */
542 if (!CONST_INT_P (align
) || UINTVAL (align
) < GET_MODE_SIZE (Xmode
))
545 testval
= gen_reg_rtx (Xmode
);
546 addr
= copy_addr_to_reg (XEXP (src
, 0));
547 addr_plus_regsz
= gen_reg_rtx (Pmode
);
548 word
= gen_reg_rtx (Xmode
);
549 zeros
= gen_reg_rtx (Xmode
);
552 emit_insn (gen_rtx_SET (testval
, constm1_rtx
));
554 emit_insn (gen_rtx_SET (testval
, const0_rtx
));
556 do_add3 (addr_plus_regsz
, addr
, GEN_INT (UNITS_PER_WORD
));
558 loop_label
= gen_label_rtx ();
559 emit_label (loop_label
);
561 /* Load a word and use orc.b/th.tstnbz to find a zero-byte. */
562 do_load_from_addr (Xmode
, word
, addr
, src
);
563 do_add3 (addr
, addr
, GEN_INT (UNITS_PER_WORD
));
565 do_orcb2 (word
, word
);
567 do_th_tstnbz2 (word
, word
);
568 cond
= gen_rtx_EQ (VOIDmode
, word
, testval
);
569 emit_unlikely_jump_insn (gen_cbranch4 (Xmode
, cond
, word
, testval
, loop_label
));
571 /* Calculate the return value by counting zero-bits. */
573 do_one_cmpl2 (word
, word
);
574 if (TARGET_BIG_ENDIAN
)
575 do_clz2 (zeros
, word
);
577 do_ctz2 (zeros
, word
);
580 do_th_rev2 (word
, word
);
581 do_clz2 (zeros
, word
);
584 do_lshr3 (zeros
, zeros
, GEN_INT (exact_log2 (BITS_PER_UNIT
)));
585 do_add3 (addr
, addr
, zeros
);
586 do_sub3 (result
, addr
, addr_plus_regsz
);
591 /* Expand a strlen operation and return true if successful.
592 Return false if we should let the compiler generate normal
593 code, probably a strlen call. */
596 riscv_expand_strlen (rtx result
, rtx src
, rtx search_char
, rtx align
)
598 if (TARGET_VECTOR
&& stringop_strategy
& STRATEGY_VECTOR
)
600 riscv_vector::expand_rawmemchr (E_QImode
, result
, src
, search_char
,
605 gcc_assert (search_char
== const0_rtx
);
607 if ((TARGET_ZBB
|| TARGET_XTHEADBB
) && stringop_strategy
& STRATEGY_SCALAR
)
608 return riscv_expand_strlen_scalar (result
, src
, align
);
613 /* Emit straight-line code to move LENGTH bytes from SRC to DEST.
614 Assume that the areas do not overlap. */
617 riscv_block_move_straight (rtx dest
, rtx src
, unsigned HOST_WIDE_INT length
)
619 unsigned HOST_WIDE_INT offset
, delta
;
620 unsigned HOST_WIDE_INT bits
;
622 enum machine_mode mode
;
625 bits
= MAX (BITS_PER_UNIT
,
626 MIN (BITS_PER_WORD
, MIN (MEM_ALIGN (src
), MEM_ALIGN (dest
))));
628 mode
= mode_for_size (bits
, MODE_INT
, 0).require ();
629 delta
= bits
/ BITS_PER_UNIT
;
631 /* Allocate a buffer for the temporary registers. */
632 regs
= XALLOCAVEC (rtx
, length
/ delta
);
634 /* Load as many BITS-sized chunks as possible. Use a normal load if
635 the source has enough alignment, otherwise use left/right pairs. */
636 for (offset
= 0, i
= 0; offset
+ delta
<= length
; offset
+= delta
, i
++)
638 regs
[i
] = gen_reg_rtx (mode
);
639 riscv_emit_move (regs
[i
], adjust_address (src
, mode
, offset
));
642 /* Copy the chunks to the destination. */
643 for (offset
= 0, i
= 0; offset
+ delta
<= length
; offset
+= delta
, i
++)
644 riscv_emit_move (adjust_address (dest
, mode
, offset
), regs
[i
]);
646 /* Mop up any left-over bytes. */
649 src
= adjust_address (src
, BLKmode
, offset
);
650 dest
= adjust_address (dest
, BLKmode
, offset
);
651 move_by_pieces (dest
, src
, length
- offset
,
652 MIN (MEM_ALIGN (src
), MEM_ALIGN (dest
)), RETURN_BEGIN
);
656 /* Helper function for doing a loop-based block operation on memory
657 reference MEM. Each iteration of the loop will operate on LENGTH
660 Create a new base register for use within the loop and point it to
661 the start of MEM. Create a new memory reference that uses this
662 register. Store them in *LOOP_REG and *LOOP_MEM respectively. */
665 riscv_adjust_block_mem (rtx mem
, unsigned HOST_WIDE_INT length
,
666 rtx
*loop_reg
, rtx
*loop_mem
)
668 *loop_reg
= copy_addr_to_reg (XEXP (mem
, 0));
670 /* Although the new mem does not refer to a known location,
671 it does keep up to LENGTH bytes of alignment. */
672 *loop_mem
= change_address (mem
, BLKmode
, *loop_reg
);
673 set_mem_align (*loop_mem
, MIN (MEM_ALIGN (mem
), length
* BITS_PER_UNIT
));
676 /* Move LENGTH bytes from SRC to DEST using a loop that moves BYTES_PER_ITER
677 bytes at a time. LENGTH must be at least BYTES_PER_ITER. Assume that
678 the memory regions do not overlap. */
681 riscv_block_move_loop (rtx dest
, rtx src
, unsigned HOST_WIDE_INT length
,
682 unsigned HOST_WIDE_INT bytes_per_iter
)
684 rtx label
, src_reg
, dest_reg
, final_src
, test
;
685 unsigned HOST_WIDE_INT leftover
;
687 leftover
= length
% bytes_per_iter
;
690 /* Create registers and memory references for use within the loop. */
691 riscv_adjust_block_mem (src
, bytes_per_iter
, &src_reg
, &src
);
692 riscv_adjust_block_mem (dest
, bytes_per_iter
, &dest_reg
, &dest
);
694 /* Calculate the value that SRC_REG should have after the last iteration
696 final_src
= expand_simple_binop (Pmode
, PLUS
, src_reg
, GEN_INT (length
),
699 /* Emit the start of the loop. */
700 label
= gen_label_rtx ();
703 /* Emit the loop body. */
704 riscv_block_move_straight (dest
, src
, bytes_per_iter
);
706 /* Move on to the next block. */
707 riscv_emit_move (src_reg
, plus_constant (Pmode
, src_reg
, bytes_per_iter
));
708 riscv_emit_move (dest_reg
, plus_constant (Pmode
, dest_reg
, bytes_per_iter
));
710 /* Emit the loop condition. */
711 test
= gen_rtx_NE (VOIDmode
, src_reg
, final_src
);
712 emit_jump_insn (gen_cbranch4 (Pmode
, test
, src_reg
, final_src
, label
));
714 /* Mop up any left-over bytes. */
716 riscv_block_move_straight (dest
, src
, leftover
);
718 emit_insn(gen_nop ());
721 /* Expand a cpymemsi instruction, which copies LENGTH bytes from
722 memory reference SRC to memory reference DEST. */
725 riscv_expand_block_move_scalar (rtx dest
, rtx src
, rtx length
)
727 if (!CONST_INT_P (length
))
730 unsigned HOST_WIDE_INT hwi_length
= UINTVAL (length
);
731 unsigned HOST_WIDE_INT factor
, align
;
733 align
= MIN (MIN (MEM_ALIGN (src
), MEM_ALIGN (dest
)), BITS_PER_WORD
);
734 factor
= BITS_PER_WORD
/ align
;
736 if (optimize_function_for_size_p (cfun
)
737 && hwi_length
* factor
* UNITS_PER_WORD
> MOVE_RATIO (false))
740 if (hwi_length
<= (RISCV_MAX_MOVE_BYTES_STRAIGHT
/ factor
))
742 riscv_block_move_straight (dest
, src
, INTVAL (length
));
745 else if (optimize
&& align
>= BITS_PER_WORD
)
747 unsigned min_iter_words
748 = RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER
/ UNITS_PER_WORD
;
749 unsigned iter_words
= min_iter_words
;
750 unsigned HOST_WIDE_INT bytes
= hwi_length
;
751 unsigned HOST_WIDE_INT words
= bytes
/ UNITS_PER_WORD
;
753 /* Lengthen the loop body if it shortens the tail. */
754 for (unsigned i
= min_iter_words
; i
< min_iter_words
* 2 - 1; i
++)
756 unsigned cur_cost
= iter_words
+ words
% iter_words
;
757 unsigned new_cost
= i
+ words
% i
;
758 if (new_cost
<= cur_cost
)
762 riscv_block_move_loop (dest
, src
, bytes
, iter_words
* UNITS_PER_WORD
);
769 /* This function delegates block-move expansion to either the vector
770 implementation or the scalar one. Return TRUE if successful or FALSE
774 riscv_expand_block_move (rtx dest
, rtx src
, rtx length
)
776 if (TARGET_VECTOR
&& stringop_strategy
& STRATEGY_VECTOR
)
778 bool ok
= riscv_vector::expand_block_move (dest
, src
, length
);
783 if (stringop_strategy
& STRATEGY_SCALAR
)
784 return riscv_expand_block_move_scalar (dest
, src
, length
);
789 /* --- Vector expanders --- */
791 namespace riscv_vector
{
793 /* Used by cpymemsi in riscv.md . */
796 expand_block_move (rtx dst_in
, rtx src_in
, rtx length_in
)
800 mv a3, a0 # Copy destination
802 vsetvli t0, a2, e8, m8, ta, ma # Vectors of 8b
803 vle8.v v0, (a1) # Load bytes
804 add a1, a1, t0 # Bump pointer
805 sub a2, a2, t0 # Decrement count
806 vse8.v v0, (a3) # Store bytes
807 add a3, a3, t0 # Bump pointer
808 bnez a2, loop # Any more?
811 gcc_assert (TARGET_VECTOR
);
813 HOST_WIDE_INT potential_ew
814 = (MIN (MIN (MEM_ALIGN (src_in
), MEM_ALIGN (dst_in
)), BITS_PER_WORD
)
816 machine_mode vmode
= VOIDmode
;
817 bool need_loop
= true;
818 bool size_p
= optimize_function_for_size_p (cfun
);
820 rtx end
= gen_reg_rtx (Pmode
);
822 rtx length_rtx
= length_in
;
824 if (CONST_INT_P (length_in
))
826 HOST_WIDE_INT length
= INTVAL (length_in
);
828 /* By using LMUL=8, we can copy as many bytes in one go as there
829 are bits in a vector register. If the entire block thus fits,
830 we don't need a loop. */
831 if (length
<= TARGET_MIN_VLEN
)
835 /* If a single scalar load / store pair can do the job, leave it
836 to the scalar code to do that. */
837 /* ??? If fast unaligned access is supported, the scalar code could
838 use suitably sized scalars irrespective of alignment. If that
839 gets fixed, we have to adjust the test here. */
841 if (pow2p_hwi (length
) && length
<= potential_ew
)
845 /* Find the vector mode to use. Using the largest possible element
846 size is likely to give smaller constants, and thus potentially
847 reducing code size. However, if we need a loop, we need to update
848 the pointers, and that is more complicated with a larger element
849 size, unless we use an immediate, which prevents us from dynamically
850 using the target's transfer size that the hart supports. And then,
851 unless we know the *exact* vector size of the hart, we'd need
852 multiple vsetvli / branch statements, so it's not even a size win.
853 If, in the future, we find an RVV implementation that is slower
854 for small element widths, we might allow larger element widths for
858 for (; potential_ew
; potential_ew
>>= 1)
860 scalar_int_mode elem_mode
;
861 unsigned HOST_WIDE_INT bits
= potential_ew
* BITS_PER_UNIT
;
862 unsigned HOST_WIDE_INT per_iter
;
863 HOST_WIDE_INT nunits
;
866 per_iter
= TARGET_MIN_VLEN
;
869 nunits
= per_iter
/ potential_ew
;
871 /* Unless we get an implementation that's slow for small element
872 size / non-word-aligned accesses, we assume that the hardware
873 handles this well, and we don't want to complicate the code
874 with shifting word contents around or handling extra bytes at
875 the start and/or end. So we want the total transfer size and
876 alignment to fit with the element size. */
877 if (length
% potential_ew
!= 0
878 || !int_mode_for_size (bits
, 0).exists (&elem_mode
))
880 /* Find the mode to use for the copy inside the loop - or the
881 sole copy, if there is no loop. */
884 /* Try if we have an exact mode for the copy. */
885 if (riscv_vector::get_vector_mode (elem_mode
,
886 nunits
).exists (&vmode
))
888 /* Since we don't have a mode that exactly matches the transfer
889 size, we'll need to use pred_store, which is not available
890 for all vector modes, but only iE_RVV_M* modes, hence trying
891 to find a vector mode for a merely rounded-up size is
893 Still, by choosing a lower LMUL factor that still allows
894 an entire transfer, we can reduce register pressure. */
895 for (unsigned lmul
= 1; lmul
<= 4; lmul
<<= 1)
896 if (TARGET_MIN_VLEN
* lmul
<= nunits
* BITS_PER_UNIT
897 /* Avoid losing the option of using vsetivli . */
898 && (nunits
<= 31 * lmul
|| nunits
> 31 * 8)
899 && multiple_p (BYTES_PER_RISCV_VECTOR
* lmul
, potential_ew
)
900 && (riscv_vector::get_vector_mode
901 (elem_mode
, exact_div (BYTES_PER_RISCV_VECTOR
* lmul
,
902 potential_ew
)).exists (&vmode
)))
906 /* The RVVM8?I modes are notionally 8 * BYTES_PER_RISCV_VECTOR bytes
907 wide. BYTES_PER_RISCV_VECTOR can't be evenly divided by
908 the sizes of larger element types; the LMUL factor of 8 can at
909 the moment be divided by the SEW, with SEW of up to 8 bytes,
910 but there are reserved encodings so there might be larger
911 SEW in the future. */
912 if (riscv_vector::get_vector_mode
913 (elem_mode
, exact_div (BYTES_PER_RISCV_VECTOR
* 8,
914 potential_ew
)).exists (&vmode
))
917 /* We may get here if we tried an element size that's larger than
918 the hardware supports, but we should at least find a suitable
920 gcc_assert (potential_ew
> 1);
922 if (potential_ew
> 1)
923 length_rtx
= GEN_INT (length
/ potential_ew
);
927 vmode
= E_RVVM8QImode
;
930 /* A memcpy libcall in the worst case takes 3 instructions to prepare the
931 arguments + 1 for the call. When RVV should take 7 instructions and
932 we're optimizing for size a libcall may be preferable. */
933 if (size_p
&& need_loop
)
936 /* length_rtx holds the (remaining) length of the required copy.
937 cnt holds the length we copy with the current load/store pair. */
938 rtx cnt
= length_rtx
;
939 rtx label
= NULL_RTX
;
940 rtx dst_addr
= copy_addr_to_reg (XEXP (dst_in
, 0));
941 rtx src_addr
= copy_addr_to_reg (XEXP (src_in
, 0));
945 length_rtx
= copy_to_mode_reg (Pmode
, length_rtx
);
946 cnt
= gen_reg_rtx (Pmode
);
947 label
= gen_label_rtx ();
950 emit_insn (riscv_vector::gen_no_side_effects_vsetvl_rtx (vmode
, cnt
,
954 vec
= gen_reg_rtx (vmode
);
955 src
= change_address (src_in
, vmode
, src_addr
);
956 dst
= change_address (dst_in
, vmode
, dst_addr
);
958 /* If we don't need a loop and have a suitable mode to describe the size,
959 just do a load / store pair and leave it up to the later lazy code
960 motion pass to insert the appropriate vsetvli. */
961 if (!need_loop
&& known_eq (GET_MODE_SIZE (vmode
), INTVAL (length_in
)))
963 emit_move_insn (vec
, src
);
964 emit_move_insn (dst
, vec
);
968 machine_mode mask_mode
= riscv_vector::get_vector_mode
969 (BImode
, GET_MODE_NUNITS (vmode
)).require ();
970 rtx mask
= CONSTM1_RTX (mask_mode
);
971 if (!satisfies_constraint_K (cnt
))
972 cnt
= force_reg (Pmode
, cnt
);
973 rtx m_ops
[] = {vec
, mask
, src
};
974 emit_nonvlmax_insn (code_for_pred_mov (vmode
),
975 riscv_vector::UNARY_OP_TAMA
, m_ops
, cnt
);
976 emit_insn (gen_pred_store (vmode
, dst
, mask
, vec
, cnt
,
977 get_avl_type_rtx (riscv_vector::NONVLMAX
)));
982 emit_insn (gen_rtx_SET (src_addr
, gen_rtx_PLUS (Pmode
, src_addr
, cnt
)));
983 emit_insn (gen_rtx_SET (dst_addr
, gen_rtx_PLUS (Pmode
, dst_addr
, cnt
)));
984 emit_insn (gen_rtx_SET (length_rtx
, gen_rtx_MINUS (Pmode
, length_rtx
, cnt
)));
986 /* Emit the loop condition. */
987 rtx test
= gen_rtx_NE (VOIDmode
, end
, const0_rtx
);
988 emit_jump_insn (gen_cbranch4 (Pmode
, test
, length_rtx
, const0_rtx
, label
));
989 emit_insn (gen_nop ());
996 /* Implement rawmemchr<mode> and strlen using vector instructions.
997 It can be assumed that the needle is in the haystack, otherwise the
998 behavior is undefined. */
1001 expand_rawmemchr (machine_mode mode
, rtx dst
, rtx haystack
, rtx needle
,
1007 vsetvli a1, zero, e[8,16,32,64], m1, ta, ma
1008 vle[8,16,32,64]ff.v v8, (a0) # Load.
1009 csrr a1, vl # Get number of bytes read.
1010 vmseq.vx v0, v8, pat # v0 = (v8 == {pat, pat, ...})
1011 vfirst.m a2, v0 # Find first hit.
1012 add a0, a0, a1 # Bump pointer.
1013 bltz a2, loop # Not found?
1015 sub a0, a0, a1 # Go back by a1.
1016 slli a2, a2, [0,1,2,3] # Shift to get byte offset.
1017 add a0, a0, a2 # Add the offset.
1021 gcc_assert (TARGET_VECTOR
);
1024 gcc_assert (mode
== E_QImode
);
1026 unsigned int isize
= GET_MODE_SIZE (mode
).to_constant ();
1027 int lmul
= TARGET_MAX_LMUL
;
1028 poly_int64 nunits
= exact_div (BYTES_PER_RISCV_VECTOR
* lmul
, isize
);
1031 if (!riscv_vector::get_vector_mode (GET_MODE_INNER (mode
),
1032 nunits
).exists (&vmode
))
1035 machine_mode mask_mode
= riscv_vector::get_mask_mode (vmode
);
1037 rtx cnt
= gen_reg_rtx (Pmode
);
1038 emit_move_insn (cnt
, CONST0_RTX (Pmode
));
1040 rtx end
= gen_reg_rtx (Pmode
);
1041 rtx vec
= gen_reg_rtx (vmode
);
1042 rtx mask
= gen_reg_rtx (mask_mode
);
1044 /* After finding the first vector element matching the needle, we
1045 need to multiply by the vector element width (SEW) in order to
1046 return a pointer to the matching byte. */
1047 unsigned int shift
= exact_log2 (GET_MODE_SIZE (mode
).to_constant ());
1049 rtx src_addr
= copy_addr_to_reg (XEXP (haystack
, 0));
1050 rtx start_addr
= copy_addr_to_reg (XEXP (haystack
, 0));
1052 rtx loop
= gen_label_rtx ();
1055 rtx vsrc
= change_address (haystack
, vmode
, src_addr
);
1057 /* Bump the pointer. */
1058 rtx step
= gen_reg_rtx (Pmode
);
1059 emit_insn (gen_rtx_SET (step
, gen_rtx_ASHIFT (Pmode
, cnt
, GEN_INT (shift
))));
1060 emit_insn (gen_rtx_SET (src_addr
, gen_rtx_PLUS (Pmode
, src_addr
, step
)));
1062 /* Emit a first-fault load. */
1063 rtx vlops
[] = {vec
, vsrc
};
1064 emit_vlmax_insn (code_for_pred_fault_load (vmode
),
1065 riscv_vector::UNARY_OP
, vlops
);
1067 /* Read how far we read. */
1068 if (Pmode
== SImode
)
1069 emit_insn (gen_read_vlsi (cnt
));
1071 emit_insn (gen_read_vldi_zero_extend (cnt
));
1073 /* Compare needle with haystack and store in a mask. */
1074 rtx eq
= gen_rtx_EQ (mask_mode
, gen_const_vec_duplicate (vmode
, needle
), vec
);
1075 rtx vmsops
[] = {mask
, eq
, vec
, needle
};
1076 emit_nonvlmax_insn (code_for_pred_eqne_scalar (vmode
),
1077 riscv_vector::COMPARE_OP
, vmsops
, cnt
);
1079 /* Find the first bit in the mask. */
1080 rtx vfops
[] = {end
, mask
};
1081 emit_nonvlmax_insn (code_for_pred_ffs (mask_mode
, Pmode
),
1082 riscv_vector::CPOP_OP
, vfops
, cnt
);
1084 /* Emit the loop condition. */
1085 rtx test
= gen_rtx_LT (VOIDmode
, end
, const0_rtx
);
1086 emit_jump_insn (gen_cbranch4 (Pmode
, test
, end
, const0_rtx
, loop
));
1090 /* For strlen, return the length. */
1091 emit_insn (gen_rtx_SET (dst
, gen_rtx_PLUS (Pmode
, src_addr
, end
)));
1092 emit_insn (gen_rtx_SET (dst
, gen_rtx_MINUS (Pmode
, dst
, start_addr
)));
1096 /* For rawmemchr, return the position at SRC + END * [1,2,4,8]. */
1097 emit_insn (gen_rtx_SET (end
, gen_rtx_ASHIFT (Pmode
, end
, GEN_INT (shift
))));
1098 emit_insn (gen_rtx_SET (dst
, gen_rtx_PLUS (Pmode
, src_addr
, end
)));
1102 /* Implement cmpstr<mode> using vector instructions. The ALIGNMENT and
1103 NCOMPARE parameters are unused for now. */
1106 expand_strcmp (rtx result
, rtx src1
, rtx src2
, rtx nbytes
,
1107 unsigned HOST_WIDE_INT
, bool)
1109 gcc_assert (TARGET_VECTOR
);
1111 /* We don't support big endian. */
1112 if (BYTES_BIG_ENDIAN
)
1115 bool with_length
= nbytes
!= NULL_RTX
;
1118 && (!REG_P (nbytes
) && !SUBREG_P (nbytes
) && !CONST_INT_P (nbytes
)))
1121 if (with_length
&& CONST_INT_P (nbytes
))
1122 nbytes
= force_reg (Pmode
, nbytes
);
1124 machine_mode mode
= E_QImode
;
1125 unsigned int isize
= GET_MODE_SIZE (mode
).to_constant ();
1126 int lmul
= TARGET_MAX_LMUL
;
1127 poly_int64 nunits
= exact_div (BYTES_PER_RISCV_VECTOR
* lmul
, isize
);
1130 if (!riscv_vector::get_vector_mode (GET_MODE_INNER (mode
), nunits
)
1134 machine_mode mask_mode
= riscv_vector::get_mask_mode (vmode
);
1136 /* Prepare addresses. */
1137 rtx src_addr1
= copy_addr_to_reg (XEXP (src1
, 0));
1138 rtx vsrc1
= change_address (src1
, vmode
, src_addr1
);
1140 rtx src_addr2
= copy_addr_to_reg (XEXP (src2
, 0));
1141 rtx vsrc2
= change_address (src2
, vmode
, src_addr2
);
1143 /* Set initial pointer bump to 0. */
1144 rtx cnt
= gen_reg_rtx (Pmode
);
1145 emit_move_insn (cnt
, CONST0_RTX (Pmode
));
1147 rtx sub
= gen_reg_rtx (Pmode
);
1148 emit_move_insn (sub
, CONST0_RTX (Pmode
));
1150 /* Create source vectors. */
1151 rtx vec1
= gen_reg_rtx (vmode
);
1152 rtx vec2
= gen_reg_rtx (vmode
);
1154 rtx done
= gen_label_rtx ();
1155 rtx loop
= gen_label_rtx ();
1158 /* Bump the pointers. */
1159 emit_insn (gen_rtx_SET (src_addr1
, gen_rtx_PLUS (Pmode
, src_addr1
, cnt
)));
1160 emit_insn (gen_rtx_SET (src_addr2
, gen_rtx_PLUS (Pmode
, src_addr2
, cnt
)));
1162 rtx vlops1
[] = {vec1
, vsrc1
};
1163 rtx vlops2
[] = {vec2
, vsrc2
};
1167 emit_vlmax_insn (code_for_pred_fault_load (vmode
),
1168 riscv_vector::UNARY_OP
, vlops1
);
1170 emit_vlmax_insn (code_for_pred_fault_load (vmode
),
1171 riscv_vector::UNARY_OP
, vlops2
);
1175 nbytes
= gen_lowpart (Pmode
, nbytes
);
1176 emit_nonvlmax_insn (code_for_pred_fault_load (vmode
),
1177 riscv_vector::UNARY_OP
, vlops1
, nbytes
);
1179 emit_nonvlmax_insn (code_for_pred_fault_load (vmode
),
1180 riscv_vector::UNARY_OP
, vlops2
, nbytes
);
1183 /* Read the vl for the next pointer bump. */
1184 if (Pmode
== SImode
)
1185 emit_insn (gen_read_vlsi (cnt
));
1187 emit_insn (gen_read_vldi_zero_extend (cnt
));
1191 rtx test_done
= gen_rtx_EQ (VOIDmode
, cnt
, const0_rtx
);
1192 emit_jump_insn (gen_cbranch4 (Pmode
, test_done
, cnt
, const0_rtx
, done
));
1193 emit_insn (gen_rtx_SET (nbytes
, gen_rtx_MINUS (Pmode
, nbytes
, cnt
)));
1196 /* Look for a \0 in the first string. */
1197 rtx mask0
= gen_reg_rtx (mask_mode
);
1199 = gen_rtx_EQ (mask_mode
, gen_const_vec_duplicate (vmode
, CONST0_RTX (mode
)),
1201 rtx vmsops1
[] = {mask0
, eq0
, vec1
, CONST0_RTX (mode
)};
1202 emit_nonvlmax_insn (code_for_pred_eqne_scalar (vmode
),
1203 riscv_vector::COMPARE_OP
, vmsops1
, cnt
);
1205 /* Look for vec1 != vec2 (includes vec2[i] == 0). */
1206 rtx maskne
= gen_reg_rtx (mask_mode
);
1207 rtx ne
= gen_rtx_NE (mask_mode
, vec1
, vec2
);
1208 rtx vmsops
[] = {maskne
, ne
, vec1
, vec2
};
1209 emit_nonvlmax_insn (code_for_pred_cmp (vmode
), riscv_vector::COMPARE_OP
,
1212 /* Combine both masks into one. */
1213 rtx mask
= gen_reg_rtx (mask_mode
);
1214 rtx vmorops
[] = {mask
, mask0
, maskne
};
1215 emit_nonvlmax_insn (code_for_pred (IOR
, mask_mode
),
1216 riscv_vector::BINARY_MASK_OP
, vmorops
, cnt
);
1218 /* Find the first bit in the mask (the first unequal element). */
1219 rtx found_at
= gen_reg_rtx (Pmode
);
1220 rtx vfops
[] = {found_at
, mask
};
1221 emit_nonvlmax_insn (code_for_pred_ffs (mask_mode
, Pmode
),
1222 riscv_vector::CPOP_OP
, vfops
, cnt
);
1224 /* Emit the loop condition. */
1225 rtx test
= gen_rtx_LT (VOIDmode
, found_at
, const0_rtx
);
1226 emit_jump_insn (gen_cbranch4 (Pmode
, test
, found_at
, const0_rtx
, loop
));
1228 /* Walk up to the difference point. */
1230 gen_rtx_SET (src_addr1
, gen_rtx_PLUS (Pmode
, src_addr1
, found_at
)));
1232 gen_rtx_SET (src_addr2
, gen_rtx_PLUS (Pmode
, src_addr2
, found_at
)));
1234 /* Load the respective byte and compute the difference. */
1235 rtx c1
= gen_reg_rtx (Pmode
);
1236 rtx c2
= gen_reg_rtx (Pmode
);
1238 do_load_from_addr (mode
, c1
, src_addr1
, src1
);
1239 do_load_from_addr (mode
, c2
, src_addr2
, src2
);
1241 do_sub3 (sub
, c1
, c2
);
1246 emit_insn (gen_movsi (result
, gen_lowpart (SImode
, sub
)));