/* Subroutines used to expand string operations for RISC-V.
   Copyright (C) 2023-2024 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 3, or (at your
option) any later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "tm_p.h"
#include "ira.h"
#include "print-tree.h"
#include "varasm.h"
#include "explow.h"
#include "expr.h"
#include "output.h"
#include "target.h"
#include "predict.h"
#include "optabs.h"
#include "riscv-protos.h"
#include "recog.h"
#include "tm-constrs.h"

/* Emit proper instruction depending on mode of dest.  */

#define GEN_EMIT_HELPER2(name)                                  \
static rtx_insn *                                               \
do_## name ## 2 (rtx dest, rtx src)                             \
{                                                               \
  rtx_insn *insn;                                               \
  if (GET_MODE (dest) == DImode)                                \
    insn = emit_insn (gen_ ## name ## di2 (dest, src));         \
  else                                                          \
    insn = emit_insn (gen_ ## name ## si2 (dest, src));         \
  return insn;                                                  \
}

/* Emit proper instruction depending on mode of dest.  */

#define GEN_EMIT_HELPER3(name)                                  \
static rtx_insn *                                               \
do_## name ## 3 (rtx dest, rtx src1, rtx src2)                  \
{                                                               \
  rtx_insn *insn;                                               \
  if (GET_MODE (dest) == DImode)                                \
    insn = emit_insn (gen_ ## name ## di3 (dest, src1, src2));  \
  else                                                          \
    insn = emit_insn (gen_ ## name ## si3 (dest, src1, src2));  \
  return insn;                                                  \
}

GEN_EMIT_HELPER3(add) /* do_add3 */
GEN_EMIT_HELPER3(and) /* do_and3 */
GEN_EMIT_HELPER3(ashl) /* do_ashl3 */
GEN_EMIT_HELPER2(bswap) /* do_bswap2 */
GEN_EMIT_HELPER2(clz) /* do_clz2 */
GEN_EMIT_HELPER2(ctz) /* do_ctz2 */
GEN_EMIT_HELPER3(ior) /* do_ior3 */
GEN_EMIT_HELPER3(ior_not) /* do_ior_not3 */
GEN_EMIT_HELPER3(lshr) /* do_lshr3 */
GEN_EMIT_HELPER2(neg) /* do_neg2 */
GEN_EMIT_HELPER2(orcb) /* do_orcb2 */
GEN_EMIT_HELPER2(one_cmpl) /* do_one_cmpl2 */
GEN_EMIT_HELPER3(rotr) /* do_rotr3 */
GEN_EMIT_HELPER3(sub) /* do_sub3 */
GEN_EMIT_HELPER2(th_rev) /* do_th_rev2 */
GEN_EMIT_HELPER2(th_tstnbz) /* do_th_tstnbz2 */
GEN_EMIT_HELPER3(xor) /* do_xor3 */
GEN_EMIT_HELPER2(zero_extendqi) /* do_zero_extendqi2 */

#undef GEN_EMIT_HELPER2
#undef GEN_EMIT_HELPER3

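/* For illustration only: GEN_EMIT_HELPER3 (add) above expands to a
   helper along the lines of

     static rtx_insn *
     do_add3 (rtx dest, rtx src1, rtx src2)
     {
       rtx_insn *insn;
       if (GET_MODE (dest) == DImode)
	 insn = emit_insn (gen_adddi3 (dest, src1, src2));
       else
	 insn = emit_insn (gen_addsi3 (dest, src1, src2));
       return insn;
     }

   i.e. each do_<name>2/do_<name>3 helper dispatches to the DImode or
   SImode generator that matches the destination's mode.  */
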
/* Helper function to load a byte or an Xmode register.

   MODE is the mode to use for the load (QImode or Xmode).
   DEST is the destination register for the data.
   ADDR_REG is the register that holds the address.
   ADDR is the address expression to load from.

   This function returns ADDR_REG, the register that holds
   the address.  */

static rtx
do_load_from_addr (machine_mode mode, rtx dest, rtx addr_reg, rtx addr)
{
  rtx mem = gen_rtx_MEM (mode, addr_reg);
  MEM_COPY_ATTRIBUTES (mem, addr);
  set_mem_size (mem, GET_MODE_SIZE (mode));

  if (mode == QImode)
    do_zero_extendqi2 (dest, mem);
  else if (mode == Xmode)
    emit_move_insn (dest, mem);
  else
    gcc_unreachable ();

  return addr_reg;
}

/* Generate a sequence to compare single characters in data1 and data2.

   RESULT is the register where the return value of str(n)cmp will be stored.
   DATA1 is a register which contains character1.
   DATA2 is a register which contains character2.
   FINAL_LABEL is the location after the calculation of the return value.  */

static void
emit_strcmp_scalar_compare_byte (rtx result, rtx data1, rtx data2,
				 rtx final_label)
{
  rtx tmp = gen_reg_rtx (Xmode);
  do_sub3 (tmp, data1, data2);
  emit_insn (gen_movsi (result, gen_lowpart (SImode, tmp)));
  emit_jump_insn (gen_jump (final_label));
  emit_barrier (); /* No fall-through.  */
}

/* Generate a sequence to compare two strings in data1 and data2.

   DATA1 is a register which contains string1.
   DATA2 is a register which contains string2.
   ORC1 is a register where orc.b(data1) will be stored.
   CMP_BYTES is the length of the strings.
   END_LABEL is the location of the code that calculates the return value.  */

static void
emit_strcmp_scalar_compare_subword (rtx data1, rtx data2, rtx orc1,
				    unsigned HOST_WIDE_INT cmp_bytes,
				    rtx end_label)
{
  /* Set a NUL-byte after the relevant data (behind the string).  */
  long long im = -256ll;
  rtx imask = gen_rtx_CONST_INT (Xmode, im);
  rtx m_reg = gen_reg_rtx (Xmode);
  emit_insn (gen_rtx_SET (m_reg, imask));
  do_rotr3 (m_reg, m_reg, GEN_INT (64 - cmp_bytes * BITS_PER_UNIT));
  do_and3 (data1, m_reg, data1);
  do_and3 (data2, m_reg, data2);
  if (TARGET_ZBB)
    do_orcb2 (orc1, data1);
  else
    do_th_tstnbz2 (orc1, data1);
  emit_jump_insn (gen_jump (end_label));
  emit_barrier (); /* No fall-through.  */
}
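
/* A worked example of the masking above, assuming RV64 and
   cmp_bytes == 2:
     im                  = 0xffffffffffffff00
     rotr (im, 64 - 16)  = 0xffffffffff00ffff
   so the ANDs keep the two low bytes that still need comparing,
   force the byte just behind them to NUL, and leave the remaining
   (don't-care) bytes alone; orc.b/th.tstnbz thus detects a string
   end at that byte at the latest.  */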

/* Generate a sequence to compare two strings in data1 and data2.

   DATA1 is a register which contains string1.
   DATA2 is a register which contains string2.
   ORC1 is a register where orc.b(data1) will be stored.
   TESTVAL is the value to test ORC1 against.
   END_LABEL is the location of the code that calculates the return value.
   NONUL_END_LABEL is the location of the code that calculates the return value
   in case the first string does not contain a NUL-byte.  */

static void
emit_strcmp_scalar_compare_word (rtx data1, rtx data2, rtx orc1, rtx testval,
				 rtx end_label, rtx nonul_end_label)
{
  /* Check if data1 contains a NUL character.  */
  if (TARGET_ZBB)
    do_orcb2 (orc1, data1);
  else
    do_th_tstnbz2 (orc1, data1);
  rtx cond1 = gen_rtx_NE (VOIDmode, orc1, testval);
  emit_unlikely_jump_insn (gen_cbranch4 (Pmode, cond1, orc1, testval,
					 end_label));
  /* Break out if data1 != data2.  */
  rtx cond2 = gen_rtx_NE (VOIDmode, data1, data2);
  emit_unlikely_jump_insn (gen_cbranch4 (Pmode, cond2, data1,
					 data2, nonul_end_label));
  /* Fall-through on equality.  */
}
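
/* To illustrate the NUL detection above (Zbb variant): orc.b sets each
   byte of the result to 0xff if the corresponding source byte is
   non-zero, and to 0x00 if it is zero, while TESTVAL is all-ones.
   E.g. on RV32, data1 = 0x64636261 ("abcd") yields orc1 = 0xffffffff
   == testval (no NUL so far), whereas data1 = 0x00636261 ("abc\0")
   yields orc1 = 0x00ffffff != testval and we branch to end_label.
   The th.tstnbz variant works with inverted polarity (a 0xff byte
   marks a NUL byte, and testval is 0).  */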

/* Generate the sequence of compares for strcmp/strncmp using zbb instructions.

   RESULT is the register where the return value of str(n)cmp will be stored.
   The strings are referenced by SRC1 and SRC2.
   The number of bytes to compare is defined by NBYTES.
   DATA1 is a register where string1 will be stored.
   DATA2 is a register where string2 will be stored.
   ORC1 is a register where orc.b(data1) will be stored.
   END_LABEL is the location of the code that calculates the return value.
   NONUL_END_LABEL is the location of the code that calculates the return value
   in case the first string does not contain a NUL-byte.
   FINAL_LABEL is the location of the code that comes after the calculation
   of the return value.  */

static void
emit_strcmp_scalar_load_and_compare (rtx result, rtx src1, rtx src2,
				     unsigned HOST_WIDE_INT nbytes,
				     rtx data1, rtx data2, rtx orc1,
				     rtx end_label, rtx nonul_end_label,
				     rtx final_label)
{
  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);
  rtx src1_addr = force_reg (Pmode, XEXP (src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (src2, 0));
  unsigned HOST_WIDE_INT offset = 0;

  rtx testval = gen_reg_rtx (Xmode);
  if (TARGET_ZBB)
    emit_insn (gen_rtx_SET (testval, constm1_rtx));
  else
    emit_insn (gen_rtx_SET (testval, const0_rtx));

  while (nbytes > 0)
    {
      unsigned HOST_WIDE_INT cmp_bytes = xlen < nbytes ? xlen : nbytes;
      machine_mode load_mode;
      if (cmp_bytes == 1)
	load_mode = QImode;
      else
	load_mode = Xmode;

      rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset));
      do_load_from_addr (load_mode, data1, addr1, src1);
      rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset));
      do_load_from_addr (load_mode, data2, addr2, src2);

      if (cmp_bytes == 1)
	{
	  emit_strcmp_scalar_compare_byte (result, data1, data2, final_label);
	  return;
	}
      else if (cmp_bytes < xlen)
	{
	  emit_strcmp_scalar_compare_subword (data1, data2, orc1,
					      cmp_bytes, end_label);
	  return;
	}
      else
	emit_strcmp_scalar_compare_word (data1, data2, orc1, testval,
					 end_label, nonul_end_label);

      offset += cmp_bytes;
      nbytes -= cmp_bytes;
    }
}

/* Fixup pointers and generate a call to strcmp.

   RESULT is the register where the return value of str(n)cmp will be stored.
   The strings are referenced by SRC1 and SRC2.
   The number of already compared bytes is defined by NBYTES.  */

static void
emit_strcmp_scalar_call_to_libc (rtx result, rtx src1, rtx src2,
				 unsigned HOST_WIDE_INT nbytes)
{
  /* Update pointers past what has been compared already.  */
  rtx src1_addr = force_reg (Pmode, XEXP (src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (src2, 0));
  rtx src1_new = force_reg (Pmode,
			    gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (nbytes)));
  rtx src2_new = force_reg (Pmode,
			    gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (nbytes)));

  /* Construct call to strcmp to compare the rest of the string.  */
  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
			   result, LCT_NORMAL, GET_MODE (result),
			   src1_new, Pmode, src2_new, Pmode);
}

/* Fast strcmp-result calculation if no NUL-byte in string1.

   RESULT is the register where the return value of str(n)cmp will be stored.
   The mismatching strings are stored in DATA1 and DATA2.  */

static void
emit_strcmp_scalar_result_calculation_nonul (rtx result, rtx data1, rtx data2)
{
  /* Words don't match, and no NUL byte in one word.
     Get bytes in big-endian order and compare as words.  */
  do_bswap2 (data1, data1);
  do_bswap2 (data2, data2);
  /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence.  */
  rtx tmp = gen_reg_rtx (Xmode);
  emit_insn (gen_slt_3 (LTU, Xmode, Xmode, tmp, data1, data2));
  do_neg2 (tmp, tmp);
  do_ior3 (tmp, tmp, const1_rtx);
  emit_insn (gen_movsi (result, gen_lowpart (SImode, tmp)));
}
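
/* Sketch of the branchless sequence above: after the byte swaps the
   words compare the way the original strings would under memcmp, so
     tmp = (data1 <u data2) ? 1 : 0;   // sltu
     tmp = -tmp;                       // now 0 or -1
     tmp |= 1;                         // now 1 or -1
   yields 1 for data1 > data2 and -1 for data1 < data2; equality cannot
   happen here since this code is only reached on a word mismatch.  */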

/* strcmp-result calculation.

   RESULT is the register where the return value of str(n)cmp will be stored.
   The strings are stored in DATA1 and DATA2.
   ORC1 contains orc.b(DATA1).  */

static void
emit_strcmp_scalar_result_calculation (rtx result, rtx data1, rtx data2,
				       rtx orc1)
{
  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);

  /* Convert non-equal bytes into non-NUL bytes.  */
  rtx diff = gen_reg_rtx (Xmode);
  do_xor3 (diff, data1, data2);
  rtx shift = gen_reg_rtx (Xmode);

  if (TARGET_ZBB)
    {
      /* Convert non-equal or NUL-bytes into non-NUL bytes.  */
      rtx syndrome = gen_reg_rtx (Xmode);
      do_orcb2 (diff, diff);
      do_ior_not3 (syndrome, orc1, diff);
      /* Count the number of equal bits from the beginning of the word.  */
      do_ctz2 (shift, syndrome);
    }
  else
    {
      /* Convert non-equal or NUL-bytes into non-NUL bytes.  */
      rtx syndrome = gen_reg_rtx (Xmode);
      do_th_tstnbz2 (diff, diff);
      do_one_cmpl2 (diff, diff);
      do_ior3 (syndrome, orc1, diff);
      /* Count the number of equal bits from the beginning of the word.  */
      do_th_rev2 (syndrome, syndrome);
      do_clz2 (shift, syndrome);
    }

  do_bswap2 (data1, data1);
  do_bswap2 (data2, data2);

  /* The most-significant-non-zero bit of the syndrome marks either the
     first bit that is different, or the top bit of the first zero byte.
     Shifting left now will bring the critical information into the
     top bits.  */
  do_ashl3 (data1, data1, gen_lowpart (QImode, shift));
  do_ashl3 (data2, data2, gen_lowpart (QImode, shift));

  /* But we need to zero-extend (char is unsigned) the value and then
     perform a signed 32-bit subtraction.  */
  unsigned int shiftr = (xlen - 1) * BITS_PER_UNIT;
  do_lshr3 (data1, data1, GEN_INT (shiftr));
  do_lshr3 (data2, data2, GEN_INT (shiftr));
  rtx tmp = gen_reg_rtx (Xmode);
  do_sub3 (tmp, data1, data2);
  emit_insn (gen_movsi (result, gen_lowpart (SImode, tmp)));
}
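
/* A worked example of the sequence above, assuming RV64, Zbb, and the
   strings "abcdefgh" vs. "abcdEfgh" (mismatch at byte index 4):
     diff = data1 ^ data2                     -> non-zero only in byte 4
     orc.b (diff)                             -> 0x000000ff00000000
     syndrome (marks non-equal or NUL bytes)  -> 0x000000ff00000000
     shift = ctz (syndrome)                   -> 32
   After the byte swaps the first character sits in the most
   significant byte, so shifting left by 32 moves the mismatching pair
   into the MSB; the logical right shifts by 56 then extract 'e' and
   'E' as unsigned values and the subtraction yields a positive
   result.  */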

/* Expand str(n)cmp using Zbb/TheadBb instructions.

   The result will be stored in RESULT.
   The strings are referenced by SRC1 and SRC2.
   The number of bytes to compare is defined by NBYTES.
   The alignment is defined by ALIGNMENT.
   If NCOMPARE is false then libc's strcmp() will be called if comparing
   NBYTES of both strings did not find differences or NUL-bytes.

   Return true if expansion was successful, or false otherwise.  */

static bool
riscv_expand_strcmp_scalar (rtx result, rtx src1, rtx src2,
			    unsigned HOST_WIDE_INT nbytes,
			    unsigned HOST_WIDE_INT alignment,
			    bool ncompare)
{
  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);

  gcc_assert (TARGET_ZBB || TARGET_XTHEADBB);
  gcc_assert (nbytes > 0);
  gcc_assert ((int) nbytes <= riscv_strcmp_inline_limit);
  gcc_assert (ncompare || (nbytes & (xlen - 1)) == 0);

  /* Limit to 12-bits (maximum load-offset).  */
  if (nbytes > IMM_REACH)
    nbytes = IMM_REACH;

  /* We don't support big endian.  */
  if (BYTES_BIG_ENDIAN)
    return false;

  /* We need xlen-aligned strings.  */
  if (alignment < xlen)
    return false;

  /* Overall structure of emitted code:
       Load-and-compare:
	 - Load data1 and data2
	 - Set orc1 := orc.b (data1) (or th.tstnbz)
	 - Compare strings and either:
	   - Fall-through on equality
	   - Jump to end_label if data1 contains a NUL-byte
	   - Jump to nonul_end_label if data1 != data2
	 - Calculate result value and jump to final_label
       // Fall-through
       Call-to-libc or set result to 0 (depending on ncompare)
       Jump to final_label
     nonul_end_label: // words don't match, and no NUL byte in first word.
       Calculate result value with the use of data1, data2 and orc1
       Jump to final_label
     end_label:
       Calculate result value with the use of data1, data2 and orc1
       Jump to final_label
     final_label:
       // Nothing.  */

  rtx data1 = gen_reg_rtx (Xmode);
  rtx data2 = gen_reg_rtx (Xmode);
  rtx orc1 = gen_reg_rtx (Xmode);
  rtx nonul_end_label = gen_label_rtx ();
  rtx end_label = gen_label_rtx ();
  rtx final_label = gen_label_rtx ();

  /* Generate a sequence of zbb instructions to compare out
     to the length specified.  */
  emit_strcmp_scalar_load_and_compare (result, src1, src2, nbytes,
				       data1, data2, orc1,
				       end_label, nonul_end_label,
				       final_label);

  /* All compared and everything was equal.  */
  if (ncompare)
    {
      emit_insn (gen_rtx_SET (result, gen_rtx_CONST_INT (SImode, 0)));
      emit_jump_insn (gen_jump (final_label));
      emit_barrier (); /* No fall-through.  */
    }
  else
    {
      emit_strcmp_scalar_call_to_libc (result, src1, src2, nbytes);
      emit_jump_insn (gen_jump (final_label));
      emit_barrier (); /* No fall-through.  */
    }

  emit_label (nonul_end_label);
  emit_strcmp_scalar_result_calculation_nonul (result, data1, data2);
  emit_jump_insn (gen_jump (final_label));
  emit_barrier (); /* No fall-through.  */

  emit_label (end_label);
  emit_strcmp_scalar_result_calculation (result, data1, data2, orc1);
  emit_jump_insn (gen_jump (final_label));
  emit_barrier (); /* No fall-through.  */

  emit_label (final_label);
  return true;
}

/* Expand a string compare operation.

   The result will be stored in RESULT.
   The strings are referenced by SRC1 and SRC2.
   The argument BYTES_RTX either holds the number of characters to
   compare, or is NULL_RTX.  The argument ALIGN_RTX holds the alignment.

   Return true if expansion was successful, or false otherwise.  */

bool
riscv_expand_strcmp (rtx result, rtx src1, rtx src2,
		     rtx bytes_rtx, rtx align_rtx)
{
  unsigned HOST_WIDE_INT compare_max;
  unsigned HOST_WIDE_INT nbytes;
  unsigned HOST_WIDE_INT alignment;
  bool ncompare = bytes_rtx != NULL_RTX;
  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);

  if (riscv_strcmp_inline_limit == 0)
    return false;

  /* Round down the comparison limit to a multiple of xlen.  */
  compare_max = riscv_strcmp_inline_limit & ~(xlen - 1);

  /* Decide how many bytes to compare inline.  */
  if (bytes_rtx == NULL_RTX)
    {
      nbytes = compare_max;
    }
  else
    {
      /* If we have a length, it must be constant.  */
      if (!CONST_INT_P (bytes_rtx))
	return false;
      nbytes = UINTVAL (bytes_rtx);

      /* We don't emit parts of a strncmp() call.  */
      if (nbytes > compare_max)
	return false;
    }

  /* Guarantees:
     - nbytes > 0
     - nbytes <= riscv_strcmp_inline_limit
     - nbytes is a multiple of xlen if !ncompare  */

  if (!CONST_INT_P (align_rtx))
    return false;
  alignment = UINTVAL (align_rtx);

  if (TARGET_VECTOR && stringop_strategy & STRATEGY_VECTOR)
    {
      bool ok = riscv_vector::expand_strcmp (result, src1, src2,
					     bytes_rtx, alignment,
					     ncompare);
      if (ok)
	return true;
    }

  if ((TARGET_ZBB || TARGET_XTHEADBB) && stringop_strategy & STRATEGY_SCALAR)
    return riscv_expand_strcmp_scalar (result, src1, src2, nbytes, alignment,
				       ncompare);

  return false;
}

/* If the provided string is aligned, then read XLEN bytes
   in a loop and use orc.b to find NUL-bytes.  */

static bool
riscv_expand_strlen_scalar (rtx result, rtx src, rtx align)
{
  rtx testval, addr, addr_plus_regsz, word, zeros;
  rtx loop_label, cond;

  gcc_assert (TARGET_ZBB || TARGET_XTHEADBB);

  /* The alignment needs to be known and big enough.  */
  if (!CONST_INT_P (align) || UINTVAL (align) < GET_MODE_SIZE (Xmode))
    return false;

  testval = gen_reg_rtx (Xmode);
  addr = copy_addr_to_reg (XEXP (src, 0));
  addr_plus_regsz = gen_reg_rtx (Pmode);
  word = gen_reg_rtx (Xmode);
  zeros = gen_reg_rtx (Xmode);

  if (TARGET_ZBB)
    emit_insn (gen_rtx_SET (testval, constm1_rtx));
  else
    emit_insn (gen_rtx_SET (testval, const0_rtx));

  do_add3 (addr_plus_regsz, addr, GEN_INT (UNITS_PER_WORD));

  loop_label = gen_label_rtx ();
  emit_label (loop_label);

  /* Load a word and use orc.b/th.tstnbz to find a zero-byte.  */
  do_load_from_addr (Xmode, word, addr, src);
  do_add3 (addr, addr, GEN_INT (UNITS_PER_WORD));
  if (TARGET_ZBB)
    do_orcb2 (word, word);
  else
    do_th_tstnbz2 (word, word);
  cond = gen_rtx_EQ (VOIDmode, word, testval);
  emit_unlikely_jump_insn (gen_cbranch4 (Xmode, cond, word, testval,
					 loop_label));

  /* Calculate the return value by counting zero-bits.  */
  if (TARGET_ZBB)
    do_one_cmpl2 (word, word);
  if (TARGET_BIG_ENDIAN)
    do_clz2 (zeros, word);
  else if (TARGET_ZBB)
    do_ctz2 (zeros, word);
  else
    {
      do_th_rev2 (word, word);
      do_clz2 (zeros, word);
    }

  do_lshr3 (zeros, zeros, GEN_INT (exact_log2 (BITS_PER_UNIT)));
  do_add3 (addr, addr, zeros);
  do_sub3 (result, addr, addr_plus_regsz);

  return true;
}
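
/* A worked example of the calculation above, assuming RV64, Zbb, and
   the string "ab" at an 8-byte aligned address (bytes above the NUL
   are unknown and shown as ?):
     word           = 0x??????????006261   (little endian)
     orc.b (word)   = 0x??????????00ffff   -> != -1, leave the loop
     ~orc.b (word)  = 0x??????????ff0000
     ctz            = 16 -> zeros = 16 >> 3 = 2
   addr was already bumped to start + 8 inside the loop and
   addr_plus_regsz is start + 8, so result = (start + 8) + 2
   - (start + 8) = 2.  */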

/* Expand a strlen operation and return true if successful.
   Return false if we should let the compiler generate normal
   code, probably a strlen call.  */

bool
riscv_expand_strlen (rtx result, rtx src, rtx search_char, rtx align)
{
  if (TARGET_VECTOR && stringop_strategy & STRATEGY_VECTOR)
    {
      riscv_vector::expand_rawmemchr (E_QImode, result, src, search_char,
				      /* strlen */ true);
      return true;
    }

  gcc_assert (search_char == const0_rtx);

  if ((TARGET_ZBB || TARGET_XTHEADBB) && stringop_strategy & STRATEGY_SCALAR)
    return riscv_expand_strlen_scalar (result, src, align);

  return false;
}

/* Emit straight-line code to move LENGTH bytes from SRC to DEST.
   Assume that the areas do not overlap.  */

static void
riscv_block_move_straight (rtx dest, rtx src, unsigned HOST_WIDE_INT length)
{
  unsigned HOST_WIDE_INT offset, delta;
  unsigned HOST_WIDE_INT bits;
  int i;
  machine_mode mode;
  rtx *regs;

  bits = MAX (BITS_PER_UNIT,
	      MIN (BITS_PER_WORD, MIN (MEM_ALIGN (src), MEM_ALIGN (dest))));

  mode = mode_for_size (bits, MODE_INT, 0).require ();
  delta = bits / BITS_PER_UNIT;

  /* Allocate a buffer for the temporary registers.  */
  regs = XALLOCAVEC (rtx, length / delta);

  /* Load as many BITS-sized chunks as possible, using the largest
     access size that the alignment of source and destination allows.  */
  for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++)
    {
      regs[i] = gen_reg_rtx (mode);
      riscv_emit_move (regs[i], adjust_address (src, mode, offset));
    }

  /* Copy the chunks to the destination.  */
  for (offset = 0, i = 0; offset + delta <= length; offset += delta, i++)
    riscv_emit_move (adjust_address (dest, mode, offset), regs[i]);

  /* Mop up any left-over bytes.  */
  if (offset < length)
    {
      src = adjust_address (src, BLKmode, offset);
      dest = adjust_address (dest, BLKmode, offset);
      move_by_pieces (dest, src, length - offset,
		      MIN (MEM_ALIGN (src), MEM_ALIGN (dest)), RETURN_BEGIN);
    }
}
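
/* For example, copying 15 bytes between 8-byte-aligned buffers on RV64
   emits one 8-byte load/store pair for bytes 0-7 above and leaves the
   remaining 7 bytes to move_by_pieces.  */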

/* Helper function for doing a loop-based block operation on memory
   reference MEM.  Each iteration of the loop will operate on LENGTH
   bytes of MEM.

   Create a new base register for use within the loop and point it to
   the start of MEM.  Create a new memory reference that uses this
   register.  Store them in *LOOP_REG and *LOOP_MEM respectively.  */

static void
riscv_adjust_block_mem (rtx mem, unsigned HOST_WIDE_INT length,
			rtx *loop_reg, rtx *loop_mem)
{
  *loop_reg = copy_addr_to_reg (XEXP (mem, 0));

  /* Although the new mem does not refer to a known location,
     it does keep up to LENGTH bytes of alignment.  */
  *loop_mem = change_address (mem, BLKmode, *loop_reg);
  set_mem_align (*loop_mem, MIN (MEM_ALIGN (mem), length * BITS_PER_UNIT));
}

/* Move LENGTH bytes from SRC to DEST using a loop that moves BYTES_PER_ITER
   bytes at a time.  LENGTH must be at least BYTES_PER_ITER.  Assume that
   the memory regions do not overlap.  */

static void
riscv_block_move_loop (rtx dest, rtx src, unsigned HOST_WIDE_INT length,
		       unsigned HOST_WIDE_INT bytes_per_iter)
{
  rtx label, src_reg, dest_reg, final_src, test;
  unsigned HOST_WIDE_INT leftover;

  leftover = length % bytes_per_iter;
  length -= leftover;

  /* Create registers and memory references for use within the loop.  */
  riscv_adjust_block_mem (src, bytes_per_iter, &src_reg, &src);
  riscv_adjust_block_mem (dest, bytes_per_iter, &dest_reg, &dest);

  /* Calculate the value that SRC_REG should have after the last iteration
     of the loop.  */
  final_src = expand_simple_binop (Pmode, PLUS, src_reg, GEN_INT (length),
				   0, 0, OPTAB_WIDEN);

  /* Emit the start of the loop.  */
  label = gen_label_rtx ();
  emit_label (label);

  /* Emit the loop body.  */
  riscv_block_move_straight (dest, src, bytes_per_iter);

  /* Move on to the next block.  */
  riscv_emit_move (src_reg, plus_constant (Pmode, src_reg, bytes_per_iter));
  riscv_emit_move (dest_reg, plus_constant (Pmode, dest_reg, bytes_per_iter));

  /* Emit the loop condition.  */
  test = gen_rtx_NE (VOIDmode, src_reg, final_src);
  emit_jump_insn (gen_cbranch4 (Pmode, test, src_reg, final_src, label));

  /* Mop up any left-over bytes.  */
  if (leftover)
    riscv_block_move_straight (dest, src, leftover);
  else
    emit_insn (gen_nop ());
}

/* Expand a cpymemsi instruction, which copies LENGTH bytes from
   memory reference SRC to memory reference DEST.  */

static bool
riscv_expand_block_move_scalar (rtx dest, rtx src, rtx length)
{
  if (!CONST_INT_P (length))
    return false;

  unsigned HOST_WIDE_INT hwi_length = UINTVAL (length);
  unsigned HOST_WIDE_INT factor, align;

  align = MIN (MIN (MEM_ALIGN (src), MEM_ALIGN (dest)), BITS_PER_WORD);
  factor = BITS_PER_WORD / align;

  if (optimize_function_for_size_p (cfun)
      && hwi_length * factor * UNITS_PER_WORD > MOVE_RATIO (false))
    return false;

  if (hwi_length <= (RISCV_MAX_MOVE_BYTES_STRAIGHT / factor))
    {
      riscv_block_move_straight (dest, src, INTVAL (length));
      return true;
    }
  else if (optimize && align >= BITS_PER_WORD)
    {
      unsigned min_iter_words
	= RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER / UNITS_PER_WORD;
      unsigned iter_words = min_iter_words;
      unsigned HOST_WIDE_INT bytes = hwi_length;
      unsigned HOST_WIDE_INT words = bytes / UNITS_PER_WORD;

      /* Lengthen the loop body if it shortens the tail.  */
      for (unsigned i = min_iter_words; i < min_iter_words * 2 - 1; i++)
	{
	  unsigned cur_cost = iter_words + words % iter_words;
	  unsigned new_cost = i + words % i;
	  if (new_cost <= cur_cost)
	    iter_words = i;
	}
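
      /* E.g. with min_iter_words == 4 and words == 11: a 4-word body
	 leaves a 3-word tail (cost 4 + 3 = 7), a 5-word body a 1-word
	 tail (cost 5 + 1 = 6), and a 6-word body a 5-word tail (cost
	 6 + 5 = 11), so a 5-word loop body is chosen.  */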

      riscv_block_move_loop (dest, src, bytes, iter_words * UNITS_PER_WORD);
      return true;
    }

  return false;
}

/* This function delegates block-move expansion to either the vector
   implementation or the scalar one.  Return TRUE if successful or FALSE
   otherwise.  */

bool
riscv_expand_block_move (rtx dest, rtx src, rtx length)
{
  if (TARGET_VECTOR && stringop_strategy & STRATEGY_VECTOR)
    {
      bool ok = riscv_vector::expand_block_move (dest, src, length);
      if (ok)
	return true;
    }

  if (stringop_strategy & STRATEGY_SCALAR)
    return riscv_expand_block_move_scalar (dest, src, length);

  return false;
}

/* --- Vector expanders --- */

namespace riscv_vector {

/* Used by cpymemsi in riscv.md.  */

bool
expand_block_move (rtx dst_in, rtx src_in, rtx length_in)
{
  /*
    memcpy:
	mv a3, a0                       # Copy destination
    loop:
	vsetvli t0, a2, e8, m8, ta, ma  # Vectors of 8b
	vle8.v v0, (a1)                 # Load bytes
	add a1, a1, t0                  # Bump pointer
	sub a2, a2, t0                  # Decrement count
	vse8.v v0, (a3)                 # Store bytes
	add a3, a3, t0                  # Bump pointer
	bnez a2, loop                   # Any more?
	ret                             # Return
  */
  gcc_assert (TARGET_VECTOR);

  HOST_WIDE_INT potential_ew
    = (MIN (MIN (MEM_ALIGN (src_in), MEM_ALIGN (dst_in)), BITS_PER_WORD)
       / BITS_PER_UNIT);
  machine_mode vmode = VOIDmode;
  bool need_loop = true;
  bool size_p = optimize_function_for_size_p (cfun);
  rtx src, dst;
  rtx vec;
  rtx length_rtx = length_in;

  if (CONST_INT_P (length_in))
    {
      HOST_WIDE_INT length = INTVAL (length_in);

      /* By using LMUL=8, we can copy as many bytes in one go as there
	 are bits in a vector register.  If the entire block thus fits,
	 we don't need a loop.  */
      if (length <= TARGET_MIN_VLEN)
	{
	  need_loop = false;

	  /* If a single scalar load / store pair can do the job, leave it
	     to the scalar code to do that.  */
	  /* ??? If fast unaligned access is supported, the scalar code could
	     use suitably sized scalars irrespective of alignment.  If that
	     gets fixed, we have to adjust the test here.  */

	  if (pow2p_hwi (length) && length <= potential_ew)
	    return false;
	}

      /* Find the vector mode to use.  Using the largest possible element
	 size is likely to give smaller constants, and thus potentially
	 reduce code size.  However, if we need a loop, we need to update
	 the pointers, and that is more complicated with a larger element
	 size, unless we use an immediate, which prevents us from dynamically
	 using the target's transfer size that the hart supports.  And then,
	 unless we know the *exact* vector size of the hart, we'd need
	 multiple vsetvli / branch statements, so it's not even a size win.
	 If, in the future, we find a RISC-V implementation that is slower
	 for small element widths, we might allow larger element widths for
	 loops too.  */
      if (need_loop)
	potential_ew = 1;
      for (; potential_ew; potential_ew >>= 1)
	{
	  scalar_int_mode elem_mode;
	  unsigned HOST_WIDE_INT bits = potential_ew * BITS_PER_UNIT;
	  unsigned HOST_WIDE_INT per_iter;
	  HOST_WIDE_INT nunits;

	  if (need_loop)
	    per_iter = TARGET_MIN_VLEN;
	  else
	    per_iter = length;
	  nunits = per_iter / potential_ew;

	  /* Unless we get an implementation that's slow for small element
	     size / non-word-aligned accesses, we assume that the hardware
	     handles this well, and we don't want to complicate the code
	     with shifting word contents around or handling extra bytes at
	     the start and/or end.  So we want the total transfer size and
	     alignment to fit with the element size.  */
	  if (length % potential_ew != 0
	      || !int_mode_for_size (bits, 0).exists (&elem_mode))
	    continue;
	  /* Find the mode to use for the copy inside the loop - or the
	     sole copy, if there is no loop.  */
	  if (!need_loop)
	    {
	      /* Try if we have an exact mode for the copy.  */
	      if (riscv_vector::get_vector_mode (elem_mode,
						 nunits).exists (&vmode))
		break;
	      /* Since we don't have a mode that exactly matches the transfer
		 size, we'll need to use pred_store, which is not available
		 for all vector modes, but only iE_RVV_M* modes, hence trying
		 to find a vector mode for a merely rounded-up size is
		 pointless.
		 Still, by choosing a lower LMUL factor that still allows
		 an entire transfer, we can reduce register pressure.  */
	      for (unsigned lmul = 1; lmul <= 4; lmul <<= 1)
		if (TARGET_MIN_VLEN * lmul <= nunits * BITS_PER_UNIT
		    /* Avoid losing the option of using vsetivli.  */
		    && (nunits <= 31 * lmul || nunits > 31 * 8)
		    && multiple_p (BYTES_PER_RISCV_VECTOR * lmul, potential_ew)
		    && (riscv_vector::get_vector_mode
			 (elem_mode, exact_div (BYTES_PER_RISCV_VECTOR * lmul,
						potential_ew)).exists (&vmode)))
		  break;
	    }

	  /* The RVVM8?I modes are notionally 8 * BYTES_PER_RISCV_VECTOR bytes
	     wide.  BYTES_PER_RISCV_VECTOR can't be evenly divided by
	     the sizes of larger element types; the LMUL factor of 8 can at
	     the moment be divided by the SEW, with SEW of up to 8 bytes,
	     but there are reserved encodings so there might be larger
	     SEW in the future.  */
	  if (riscv_vector::get_vector_mode
	      (elem_mode, exact_div (BYTES_PER_RISCV_VECTOR * 8,
				     potential_ew)).exists (&vmode))
	    break;

	  /* We may get here if we tried an element size that's larger than
	     the hardware supports, but we should at least find a suitable
	     byte vector mode.  */
	  gcc_assert (potential_ew > 1);
	}
      if (potential_ew > 1)
	length_rtx = GEN_INT (length / potential_ew);
    }
  else
    {
      vmode = E_RVVM8QImode;
    }

  /* A memcpy libcall in the worst case takes 3 instructions to prepare the
     arguments + 1 for the call.  The RVV loop variant needs at least 7
     instructions, so when optimizing for size a libcall may be
     preferable.  */
  if (size_p && need_loop)
    return false;

  /* length_rtx holds the (remaining) length of the required copy.
     cnt holds the length we copy with the current load/store pair.  */
  rtx cnt = length_rtx;
  rtx label = NULL_RTX;
  rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
  rtx src_addr = copy_addr_to_reg (XEXP (src_in, 0));

  if (need_loop)
    {
      length_rtx = copy_to_mode_reg (Pmode, length_rtx);
      cnt = gen_reg_rtx (Pmode);
      label = gen_label_rtx ();

      emit_label (label);
      emit_insn (riscv_vector::gen_no_side_effects_vsetvl_rtx (vmode, cnt,
							       length_rtx));
    }

  vec = gen_reg_rtx (vmode);
  src = change_address (src_in, vmode, src_addr);
  dst = change_address (dst_in, vmode, dst_addr);

  /* If we don't need a loop and have a suitable mode to describe the size,
     just do a load / store pair and leave it up to the later lazy code
     motion pass to insert the appropriate vsetvli.  */
  if (!need_loop && known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
    {
      emit_move_insn (vec, src);
      emit_move_insn (dst, vec);
    }
  else
    {
      machine_mode mask_mode = riscv_vector::get_vector_mode
	(BImode, GET_MODE_NUNITS (vmode)).require ();
      rtx mask = CONSTM1_RTX (mask_mode);
      if (!satisfies_constraint_K (cnt))
	cnt = force_reg (Pmode, cnt);
      rtx m_ops[] = {vec, mask, src};
      emit_nonvlmax_insn (code_for_pred_mov (vmode),
			  riscv_vector::UNARY_OP_TAMA, m_ops, cnt);
      emit_insn (gen_pred_store (vmode, dst, mask, vec, cnt,
				 get_avl_type_rtx (riscv_vector::NONVLMAX)));
    }

  if (need_loop)
    {
      emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt)));
      emit_insn (gen_rtx_SET (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, cnt)));
      emit_insn (gen_rtx_SET (length_rtx,
			      gen_rtx_MINUS (Pmode, length_rtx, cnt)));

      /* Emit the loop condition.  */
      rtx test = gen_rtx_NE (VOIDmode, length_rtx, const0_rtx);
      emit_jump_insn (gen_cbranch4 (Pmode, test, length_rtx, const0_rtx,
				    label));
      emit_insn (gen_nop ());
    }

  return true;
}

/* Implement rawmemchr<mode> and strlen using vector instructions.
   It can be assumed that the needle is in the haystack, otherwise the
   behavior is undefined.  */

void
expand_rawmemchr (machine_mode mode, rtx dst, rtx haystack, rtx needle,
		  bool strlen)
{
  /*
    rawmemchr:
    loop:
	vsetvli a1, zero, e[8,16,32,64], m1, ta, ma
	vle[8,16,32,64]ff.v v8, (a0)  # Load.
	csrr a1, vl                   # Get number of bytes read.
	vmseq.vx v0, v8, pat          # v0 = (v8 == {pat, pat, ...})
	vfirst.m a2, v0               # Find first hit.
	add a0, a0, a1                # Bump pointer.
	bltz a2, loop                 # Not found?

	sub a0, a0, a1                # Go back by a1.
	slli a2, a2, [0,1,2,3]        # Shift to get byte offset.
	add a0, a0, a2                # Add the offset.

	ret
  */
  gcc_assert (TARGET_VECTOR);

  if (strlen)
    gcc_assert (mode == E_QImode);

  unsigned int isize = GET_MODE_SIZE (mode).to_constant ();
  int lmul = TARGET_MAX_LMUL;
  poly_int64 nunits = exact_div (BYTES_PER_RISCV_VECTOR * lmul, isize);

  machine_mode vmode;
  if (!riscv_vector::get_vector_mode (GET_MODE_INNER (mode),
				      nunits).exists (&vmode))
    gcc_unreachable ();

  machine_mode mask_mode = riscv_vector::get_mask_mode (vmode);

  rtx cnt = gen_reg_rtx (Pmode);
  emit_move_insn (cnt, CONST0_RTX (Pmode));

  rtx end = gen_reg_rtx (Pmode);
  rtx vec = gen_reg_rtx (vmode);
  rtx mask = gen_reg_rtx (mask_mode);

  /* After finding the first vector element matching the needle, we
     need to multiply by the vector element width (SEW) in order to
     return a pointer to the matching byte.  */
  unsigned int shift = exact_log2 (GET_MODE_SIZE (mode).to_constant ());

  rtx src_addr = copy_addr_to_reg (XEXP (haystack, 0));
  rtx start_addr = copy_addr_to_reg (XEXP (haystack, 0));

  rtx loop = gen_label_rtx ();
  emit_label (loop);

  rtx vsrc = change_address (haystack, vmode, src_addr);

  /* Bump the pointer.  */
  rtx step = gen_reg_rtx (Pmode);
  emit_insn (gen_rtx_SET (step, gen_rtx_ASHIFT (Pmode, cnt, GEN_INT (shift))));
  emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, step)));

  /* Emit a first-fault load.  */
  rtx vlops[] = {vec, vsrc};
  emit_vlmax_insn (code_for_pred_fault_load (vmode),
		   riscv_vector::UNARY_OP, vlops);

  /* Read back how many elements were actually loaded.  */
  if (Pmode == SImode)
    emit_insn (gen_read_vlsi (cnt));
  else
    emit_insn (gen_read_vldi_zero_extend (cnt));

  /* Compare needle with haystack and store in a mask.  */
  rtx eq = gen_rtx_EQ (mask_mode, gen_const_vec_duplicate (vmode, needle), vec);
  rtx vmsops[] = {mask, eq, vec, needle};
  emit_nonvlmax_insn (code_for_pred_eqne_scalar (vmode),
		      riscv_vector::COMPARE_OP, vmsops, cnt);

  /* Find the first bit in the mask.  */
  rtx vfops[] = {end, mask};
  emit_nonvlmax_insn (code_for_pred_ffs (mask_mode, Pmode),
		      riscv_vector::CPOP_OP, vfops, cnt);

  /* Emit the loop condition.  */
  rtx test = gen_rtx_LT (VOIDmode, end, const0_rtx);
  emit_jump_insn (gen_cbranch4 (Pmode, test, end, const0_rtx, loop));

  if (strlen)
    {
      /* For strlen, return the length.  */
      emit_insn (gen_rtx_SET (dst, gen_rtx_PLUS (Pmode, src_addr, end)));
      emit_insn (gen_rtx_SET (dst, gen_rtx_MINUS (Pmode, dst, start_addr)));
    }
  else
    {
      /* For rawmemchr, return the position at SRC + END * [1,2,4,8].  */
      emit_insn (gen_rtx_SET (end, gen_rtx_ASHIFT (Pmode, end,
						   GEN_INT (shift))));
      emit_insn (gen_rtx_SET (dst, gen_rtx_PLUS (Pmode, src_addr, end)));
    }
}

/* Implement cmpstr<mode> using vector instructions.  The ALIGNMENT and
   NCOMPARE parameters are unused for now.  */

bool
expand_strcmp (rtx result, rtx src1, rtx src2, rtx nbytes,
	       unsigned HOST_WIDE_INT, bool)
{
  gcc_assert (TARGET_VECTOR);
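
  /* Rough structure of the generated code, shown here for the variant
     without a length argument (a hedged sketch, not necessarily the
     exact instruction sequence):
      loop:
	vle8ff.v v8, (a0)      # Load string1, fault-only-first.
	vle8ff.v v16, (a1)     # Load string2.
	csrr t0, vl            # Number of elements actually read.
	vmseq.vx v0, v8, zero  # NUL bytes in string1?
	vmsne.vv v1, v8, v16   # Differences between the strings?
	vmor.mm v0, v0, v1     # Combine both masks.
	vfirst.m a2, v0        # Index of the first NUL/difference.
	add a0, a0, t0         # Bump pointers.
	add a1, a1, t0
	bltz a2, loop          # Nothing found yet?
     followed by a scalar load of the two bytes at the found index and
     a subtraction to form the return value.  */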

  /* We don't support big endian.  */
  if (BYTES_BIG_ENDIAN)
    return false;

  bool with_length = nbytes != NULL_RTX;

  if (with_length
      && (!REG_P (nbytes) && !SUBREG_P (nbytes) && !CONST_INT_P (nbytes)))
    return false;

  if (with_length && CONST_INT_P (nbytes))
    nbytes = force_reg (Pmode, nbytes);

  machine_mode mode = E_QImode;
  unsigned int isize = GET_MODE_SIZE (mode).to_constant ();
  int lmul = TARGET_MAX_LMUL;
  poly_int64 nunits = exact_div (BYTES_PER_RISCV_VECTOR * lmul, isize);

  machine_mode vmode;
  if (!riscv_vector::get_vector_mode (GET_MODE_INNER (mode), nunits)
	.exists (&vmode))
    gcc_unreachable ();

  machine_mode mask_mode = riscv_vector::get_mask_mode (vmode);

  /* Prepare addresses.  */
  rtx src_addr1 = copy_addr_to_reg (XEXP (src1, 0));
  rtx vsrc1 = change_address (src1, vmode, src_addr1);

  rtx src_addr2 = copy_addr_to_reg (XEXP (src2, 0));
  rtx vsrc2 = change_address (src2, vmode, src_addr2);

  /* Set initial pointer bump to 0.  */
  rtx cnt = gen_reg_rtx (Pmode);
  emit_move_insn (cnt, CONST0_RTX (Pmode));

  rtx sub = gen_reg_rtx (Pmode);
  emit_move_insn (sub, CONST0_RTX (Pmode));

  /* Create source vectors.  */
  rtx vec1 = gen_reg_rtx (vmode);
  rtx vec2 = gen_reg_rtx (vmode);

  rtx done = gen_label_rtx ();
  rtx loop = gen_label_rtx ();
  emit_label (loop);

  /* Bump the pointers.  */
  emit_insn (gen_rtx_SET (src_addr1, gen_rtx_PLUS (Pmode, src_addr1, cnt)));
  emit_insn (gen_rtx_SET (src_addr2, gen_rtx_PLUS (Pmode, src_addr2, cnt)));

  rtx vlops1[] = {vec1, vsrc1};
  rtx vlops2[] = {vec2, vsrc2};

  if (!with_length)
    {
      emit_vlmax_insn (code_for_pred_fault_load (vmode),
		       riscv_vector::UNARY_OP, vlops1);

      emit_vlmax_insn (code_for_pred_fault_load (vmode),
		       riscv_vector::UNARY_OP, vlops2);
    }
  else
    {
      nbytes = gen_lowpart (Pmode, nbytes);
      emit_nonvlmax_insn (code_for_pred_fault_load (vmode),
			  riscv_vector::UNARY_OP, vlops1, nbytes);

      emit_nonvlmax_insn (code_for_pred_fault_load (vmode),
			  riscv_vector::UNARY_OP, vlops2, nbytes);
    }

  /* Read the vl for the next pointer bump.  */
  if (Pmode == SImode)
    emit_insn (gen_read_vlsi (cnt));
  else
    emit_insn (gen_read_vldi_zero_extend (cnt));

  if (with_length)
    {
      rtx test_done = gen_rtx_EQ (VOIDmode, cnt, const0_rtx);
      emit_jump_insn (gen_cbranch4 (Pmode, test_done, cnt, const0_rtx, done));
      emit_insn (gen_rtx_SET (nbytes, gen_rtx_MINUS (Pmode, nbytes, cnt)));
    }

  /* Look for a \0 in the first string.  */
  rtx mask0 = gen_reg_rtx (mask_mode);
  rtx eq0
    = gen_rtx_EQ (mask_mode,
		  gen_const_vec_duplicate (vmode, CONST0_RTX (mode)), vec1);
  rtx vmsops1[] = {mask0, eq0, vec1, CONST0_RTX (mode)};
  emit_nonvlmax_insn (code_for_pred_eqne_scalar (vmode),
		      riscv_vector::COMPARE_OP, vmsops1, cnt);

  /* Look for vec1 != vec2 (includes vec2[i] == 0).  */
  rtx maskne = gen_reg_rtx (mask_mode);
  rtx ne = gen_rtx_NE (mask_mode, vec1, vec2);
  rtx vmsops[] = {maskne, ne, vec1, vec2};
  emit_nonvlmax_insn (code_for_pred_cmp (vmode), riscv_vector::COMPARE_OP,
		      vmsops, cnt);

  /* Combine both masks into one.  */
  rtx mask = gen_reg_rtx (mask_mode);
  rtx vmorops[] = {mask, mask0, maskne};
  emit_nonvlmax_insn (code_for_pred (IOR, mask_mode),
		      riscv_vector::BINARY_MASK_OP, vmorops, cnt);

  /* Find the first bit in the mask (the first unequal element).  */
  rtx found_at = gen_reg_rtx (Pmode);
  rtx vfops[] = {found_at, mask};
  emit_nonvlmax_insn (code_for_pred_ffs (mask_mode, Pmode),
		      riscv_vector::CPOP_OP, vfops, cnt);

  /* Emit the loop condition.  */
  rtx test = gen_rtx_LT (VOIDmode, found_at, const0_rtx);
  emit_jump_insn (gen_cbranch4 (Pmode, test, found_at, const0_rtx, loop));

  /* Walk up to the difference point.  */
  emit_insn (
    gen_rtx_SET (src_addr1, gen_rtx_PLUS (Pmode, src_addr1, found_at)));
  emit_insn (
    gen_rtx_SET (src_addr2, gen_rtx_PLUS (Pmode, src_addr2, found_at)));

  /* Load the respective byte and compute the difference.  */
  rtx c1 = gen_reg_rtx (Pmode);
  rtx c2 = gen_reg_rtx (Pmode);

  do_load_from_addr (mode, c1, src_addr1, src1);
  do_load_from_addr (mode, c2, src_addr2, src2);

  do_sub3 (sub, c1, c2);

  if (with_length)
    emit_label (done);

  emit_insn (gen_movsi (result, gen_lowpart (SImode, sub)));
  return true;
}

}