#define HAVE_CHUNKMEMSET_1
#define HAVE_CHUNKMEMSET_2
-#define HAVE_CHUNKMEMSET_3
#define HAVE_CHUNKMEMSET_4
#define HAVE_CHUNKMEMSET_8
#define CHUNKMEMSET chunkmemset_neon
#define CHUNKMEMSET_SAFE chunkmemset_safe_neon
-uint8_t* CHUNKCOPY(uint8_t *out, uint8_t const *from, unsigned len);
-uint8_t* CHUNKUNROLL(uint8_t *out, unsigned *dist, unsigned *len);
-
-static inline uint8_t *chunkmemset_3(uint8_t *out, uint8_t *from, unsigned dist, unsigned len) { /*
- * NEON fast path for a 3-byte repeating pattern; the dispatcher calls it when dist == 3.
- * OUT:  destination cursor, advanced by SZ after every vector store.
- * FROM: address of the 3 pattern bytes that get loaded and replicated.
- * DIST: only consumed by the CHUNKUNROLL/CHUNKCOPY fallback paths.
- * LEN:  requested byte count, split into whole SZ-sized stores plus a remainder.
- * Returns OUT after the last vector store when LEN is a multiple of SZ,
- * otherwise whatever CHUNKCOPY returns. */
- uint8x8x3_t chunks;
- unsigned sz = sizeof(chunks); /* bytes covered by one vst3_u8 store; OUT moves by this much per iteration */
- if (len < sz) { /* not enough room for even one full vector store: defer to the generic helpers */
- out = CHUNKUNROLL(out, &dist, &len); /* may update OUT, DIST and LEN (helper defined elsewhere) */
- return CHUNKCOPY(out, out - dist, len);
- }
-
- /* Load 3 bytes 'a,b,c' from FROM and duplicate across all lanes:
- chunks[0] = {a,a,a,a,a,a,a,a}
- chunks[1] = {b,b,b,b,b,b,b,b}
- chunks[2] = {c,c,c,c,c,c,c,c}. */
- chunks = vld3_dup_u8(from);
-
- unsigned rem = len % sz; /* tail bytes that do not fill a whole vector store */
- len -= rem; /* LEN is now an exact multiple of SZ, so the loop below terminates at 0 */
- while (len) {
- /* Store "a,b,c, ..., a,b,c". */
- vst3_u8(out, chunks);
- out += sz;
- len -= sz;
- }
-
- if (!rem)
- return out; /* nothing left over: OUT already points past the written data */
-
- /* Last, deal with the case when LEN is not a multiple of SZ. */
- out = CHUNKUNROLL(out, &dist, &rem);
- return CHUNKCOPY(out, out - dist, rem);
-}
-
-#if defined(__aarch64__) || defined(_M_ARM64)
-
-#define HAVE_CHUNKMEMSET_6
-
-static inline uint8_t *chunkmemset_6(uint8_t *out, uint8_t *from, unsigned dist, unsigned len) { /*
- * NEON fast path for a 6-byte repeating pattern; the dispatcher calls it when dist == 6.
- * The pattern is handled as three 16-bit elements, each replicated across one vector.
- * OUT:  destination cursor, advanced by SZ after every vector store.
- * FROM: address of the 6 pattern bytes that get loaded and replicated.
- * DIST: only consumed by the CHUNKUNROLL/CHUNKCOPY fallback paths.
- * LEN:  requested byte count, split into whole SZ-sized stores plus a remainder.
- * Returns OUT after the last vector store when LEN is a multiple of SZ,
- * otherwise whatever CHUNKCOPY returns. */
- uint16x8x3_t chunks;
- unsigned sz = sizeof(chunks); /* bytes covered by one vst3q_u16 store; OUT moves by this much per iteration */
- if (len < sz) { /* not enough room for even one full vector store: defer to the generic helpers */
- out = CHUNKUNROLL(out, &dist, &len); /* may update OUT, DIST and LEN (helper defined elsewhere) */
- return CHUNKCOPY(out, out - dist, len);
- }
-
- /* Load 6 bytes 'ab,cd,ef' from FROM and duplicate across all lanes:
- chunks[0] = {ab,ab,ab,ab,ab,ab,ab,ab}
- chunks[1] = {cd,cd,cd,cd,cd,cd,cd,cd}
- chunks[2] = {ef,ef,ef,ef,ef,ef,ef,ef}. */
- chunks = vld3q_dup_u16((unsigned short *)from); /* NOTE(review): byte pointer reinterpreted as 16-bit elements -- confirm alignment/aliasing is acceptable */
-
- unsigned rem = len % sz; /* tail bytes that do not fill a whole vector store */
- len -= rem; /* LEN is now an exact multiple of SZ, so the loop below terminates at 0 */
- while (len) {
- /* Store "ab,cd,ef, ..., ab,cd,ef". */
- vst3q_u16((unsigned short *)out, chunks); /* NOTE(review): same uint8_t * -> unsigned short * cast on the store side */
- out += sz;
- len -= sz;
- }
-
- if (!rem)
- return out; /* nothing left over: OUT already points past the written data */
-
- /* Last, deal with the case when LEN is not a multiple of SZ. */
- out = CHUNKUNROLL(out, &dist, &rem);
- return CHUNKCOPY(out, out - dist, rem);
-}
-
-#endif
-
/* Read one chunk starting at S and store it in *CHUNK.
   The load is done with vld1q_u8 (the NEON 128-bit byte-vector load), so
   chunk_t is presumably uint8x16_t -- TODO confirm against the typedef. */
static inline void loadchunk(uint8_t const *s, chunk_t *chunk) {
    chunk_t const loaded = vld1q_u8(s);

    *chunk = loaded;
}
chunkmemset_2(from, &chunk);
} else
#endif
-#ifdef HAVE_CHUNKMEMSET_3
- if (dist == 3) {
- return chunkmemset_3(out, from, dist, len);
- } else
-#endif
#ifdef HAVE_CHUNKMEMSET_4
if (dist == 4) {
chunkmemset_4(from, &chunk);
} else
#endif
-#ifdef HAVE_CHUNKMEMSET_6
- if (dist == 6) {
- return chunkmemset_6(out, from, dist, len);
- } else
-#endif
#ifdef HAVE_CHUNKMEMSET_8
if (dist == 8) {
chunkmemset_8(from, &chunk);
#endif
if (dist == sz) {
loadchunk(from, &chunk);
+ } else if (dist < sz) {
+ unsigned char *end = out + len - 1;
+ while (len > dist) {
+ out = CHUNKCOPY_SAFE(out, from, dist, end);
+ len -= dist;
+ }
+ if (len > 0) {
+ out = CHUNKCOPY_SAFE(out, from, len, end);
+ }
+ return out;
} else {
out = CHUNKUNROLL(out, &dist, &len);
return CHUNKCOPY(out, out - dist, len);