FCLAMP(sme2_bfclamp, bfloat16, H2)
#undef FCLAMP
+
+void HELPER(sme2_sel_b)(void *vd, void *vn, void *vm,
+ uint32_t png, uint32_t desc)
+{
+ int vl = simd_oprsz(desc);
+ int nreg = simd_data(desc);
+ int elements = vl / sizeof(uint8_t);
+ DecodeCounter p = decode_counter(png, vl, MO_8);
+
+ if (p.lg2_stride == 0) {
+ if (p.invert) {
+ for (int r = 0; r < nreg; r++) {
+ uint8_t *d = vd + r * sizeof(ARMVectorReg);
+ uint8_t *n = vn + r * sizeof(ARMVectorReg);
+ uint8_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+
+ if (split <= 0) {
+ memcpy(d, n, vl); /* all true */
+ } else if (elements <= split) {
+ memcpy(d, m, vl); /* all false */
+ } else {
+ for (int e = 0; e < split; e++) {
+ d[H1(e)] = m[H1(e)];
+ }
+ for (int e = split; e < elements; e++) {
+ d[H1(e)] = n[H1(e)];
+ }
+ }
+ }
+ } else {
+ for (int r = 0; r < nreg; r++) {
+ uint8_t *d = vd + r * sizeof(ARMVectorReg);
+ uint8_t *n = vn + r * sizeof(ARMVectorReg);
+ uint8_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+
+ if (split <= 0) {
+ memcpy(d, m, vl); /* all false */
+ } else if (elements <= split) {
+ memcpy(d, n, vl); /* all true */
+ } else {
+ for (int e = 0; e < split; e++) {
+ d[H1(e)] = n[H1(e)];
+ }
+ for (int e = split; e < elements; e++) {
+ d[H1(e)] = m[H1(e)];
+ }
+ }
+ }
+ }
+ } else {
+ int estride = 1 << p.lg2_stride;
+ if (p.invert) {
+ for (int r = 0; r < nreg; r++) {
+ uint8_t *d = vd + r * sizeof(ARMVectorReg);
+ uint8_t *n = vn + r * sizeof(ARMVectorReg);
+ uint8_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+ int e = 0;
+
+ for (; e < MIN(split, elements); e++) {
+ d[H1(e)] = m[H1(e)];
+ }
+ for (; e < elements; e += estride) {
+ d[H1(e)] = n[H1(e)];
+ for (int i = 1; i < estride; i++) {
+ d[H1(e + i)] = m[H1(e + i)];
+ }
+ }
+ }
+ } else {
+ for (int r = 0; r < nreg; r++) {
+ uint8_t *d = vd + r * sizeof(ARMVectorReg);
+ uint8_t *n = vn + r * sizeof(ARMVectorReg);
+ uint8_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+ int e = 0;
+
+ for (; e < MIN(split, elements); e += estride) {
+ d[H1(e)] = n[H1(e)];
+ for (int i = 1; i < estride; i++) {
+ d[H1(e + i)] = m[H1(e + i)];
+ }
+ }
+ for (; e < elements; e++) {
+ d[H1(e)] = m[H1(e)];
+ }
+ }
+ }
+ }
+}
+
+void HELPER(sme2_sel_h)(void *vd, void *vn, void *vm,
+ uint32_t png, uint32_t desc)
+{
+ int vl = simd_oprsz(desc);
+ int nreg = simd_data(desc);
+ int elements = vl / sizeof(uint16_t);
+ DecodeCounter p = decode_counter(png, vl, MO_16);
+
+ if (p.lg2_stride == 0) {
+ if (p.invert) {
+ for (int r = 0; r < nreg; r++) {
+ uint16_t *d = vd + r * sizeof(ARMVectorReg);
+ uint16_t *n = vn + r * sizeof(ARMVectorReg);
+ uint16_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+
+ if (split <= 0) {
+ memcpy(d, n, vl); /* all true */
+ } else if (elements <= split) {
+ memcpy(d, m, vl); /* all false */
+ } else {
+ for (int e = 0; e < split; e++) {
+ d[H2(e)] = m[H2(e)];
+ }
+ for (int e = split; e < elements; e++) {
+ d[H2(e)] = n[H2(e)];
+ }
+ }
+ }
+ } else {
+ for (int r = 0; r < nreg; r++) {
+ uint16_t *d = vd + r * sizeof(ARMVectorReg);
+ uint16_t *n = vn + r * sizeof(ARMVectorReg);
+ uint16_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+
+ if (split <= 0) {
+ memcpy(d, m, vl); /* all false */
+ } else if (elements <= split) {
+ memcpy(d, n, vl); /* all true */
+ } else {
+ for (int e = 0; e < split; e++) {
+ d[H2(e)] = n[H2(e)];
+ }
+ for (int e = split; e < elements; e++) {
+ d[H2(e)] = m[H2(e)];
+ }
+ }
+ }
+ }
+ } else {
+ int estride = 1 << p.lg2_stride;
+ if (p.invert) {
+ for (int r = 0; r < nreg; r++) {
+ uint16_t *d = vd + r * sizeof(ARMVectorReg);
+ uint16_t *n = vn + r * sizeof(ARMVectorReg);
+ uint16_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+ int e = 0;
+
+ for (; e < MIN(split, elements); e++) {
+ d[H2(e)] = m[H2(e)];
+ }
+ for (; e < elements; e += estride) {
+ d[H2(e)] = n[H2(e)];
+ for (int i = 1; i < estride; i++) {
+ d[H2(e + i)] = m[H2(e + i)];
+ }
+ }
+ }
+ } else {
+ for (int r = 0; r < nreg; r++) {
+ uint16_t *d = vd + r * sizeof(ARMVectorReg);
+ uint16_t *n = vn + r * sizeof(ARMVectorReg);
+ uint16_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+ int e = 0;
+
+ for (; e < MIN(split, elements); e += estride) {
+ d[H2(e)] = n[H2(e)];
+ for (int i = 1; i < estride; i++) {
+ d[H2(e + i)] = m[H2(e + i)];
+ }
+ }
+ for (; e < elements; e++) {
+ d[H2(e)] = m[H2(e)];
+ }
+ }
+ }
+ }
+}
+
+void HELPER(sme2_sel_s)(void *vd, void *vn, void *vm,
+ uint32_t png, uint32_t desc)
+{
+ int vl = simd_oprsz(desc);
+ int nreg = simd_data(desc);
+ int elements = vl / sizeof(uint32_t);
+ DecodeCounter p = decode_counter(png, vl, MO_32);
+
+ if (p.lg2_stride == 0) {
+ if (p.invert) {
+ for (int r = 0; r < nreg; r++) {
+ uint32_t *d = vd + r * sizeof(ARMVectorReg);
+ uint32_t *n = vn + r * sizeof(ARMVectorReg);
+ uint32_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+
+ if (split <= 0) {
+ memcpy(d, n, vl); /* all true */
+ } else if (elements <= split) {
+ memcpy(d, m, vl); /* all false */
+ } else {
+ for (int e = 0; e < split; e++) {
+ d[H4(e)] = m[H4(e)];
+ }
+ for (int e = split; e < elements; e++) {
+ d[H4(e)] = n[H4(e)];
+ }
+ }
+ }
+ } else {
+ for (int r = 0; r < nreg; r++) {
+ uint32_t *d = vd + r * sizeof(ARMVectorReg);
+ uint32_t *n = vn + r * sizeof(ARMVectorReg);
+ uint32_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+
+ if (split <= 0) {
+ memcpy(d, m, vl); /* all false */
+ } else if (elements <= split) {
+ memcpy(d, n, vl); /* all true */
+ } else {
+ for (int e = 0; e < split; e++) {
+ d[H4(e)] = n[H4(e)];
+ }
+ for (int e = split; e < elements; e++) {
+ d[H4(e)] = m[H4(e)];
+ }
+ }
+ }
+ }
+ } else {
+ /* p.esz must be MO_64, so stride must be 2. */
+ if (p.invert) {
+ for (int r = 0; r < nreg; r++) {
+ uint32_t *d = vd + r * sizeof(ARMVectorReg);
+ uint32_t *n = vn + r * sizeof(ARMVectorReg);
+ uint32_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+ int e = 0;
+
+ for (; e < MIN(split, elements); e++) {
+ d[H4(e)] = m[H4(e)];
+ }
+ for (; e < elements; e += 2) {
+ d[H4(e)] = n[H4(e)];
+ d[H4(e + 1)] = m[H4(e + 1)];
+ }
+ }
+ } else {
+ for (int r = 0; r < nreg; r++) {
+ uint32_t *d = vd + r * sizeof(ARMVectorReg);
+ uint32_t *n = vn + r * sizeof(ARMVectorReg);
+ uint32_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+ int e = 0;
+
+ for (; e < MIN(split, elements); e += 2) {
+ d[H4(e)] = n[H4(e)];
+ d[H4(e + 1)] = m[H4(e + 1)];
+ }
+ for (; e < elements; e++) {
+ d[H4(e)] = m[H4(e)];
+ }
+ }
+ }
+ }
+}
+
+void HELPER(sme2_sel_d)(void *vd, void *vn, void *vm,
+ uint32_t png, uint32_t desc)
+{
+ int vl = simd_oprsz(desc);
+ int nreg = simd_data(desc);
+ int elements = vl / sizeof(uint64_t);
+ DecodeCounter p = decode_counter(png, vl, MO_64);
+
+ if (p.invert) {
+ for (int r = 0; r < nreg; r++) {
+ uint64_t *d = vd + r * sizeof(ARMVectorReg);
+ uint64_t *n = vn + r * sizeof(ARMVectorReg);
+ uint64_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+
+ if (split <= 0) {
+ memcpy(d, n, vl); /* all true */
+ } else if (elements <= split) {
+ memcpy(d, m, vl); /* all false */
+ } else {
+ memcpy(d, m, split * sizeof(uint64_t));
+ memcpy(d + split, n + split,
+ (elements - split) * sizeof(uint64_t));
+ }
+ }
+ } else {
+ for (int r = 0; r < nreg; r++) {
+ uint64_t *d = vd + r * sizeof(ARMVectorReg);
+ uint64_t *n = vn + r * sizeof(ARMVectorReg);
+ uint64_t *m = vm + r * sizeof(ARMVectorReg);
+ int split = p.count - r * elements;
+
+ if (split <= 0) {
+ memcpy(d, m, vl); /* all false */
+ } else if (elements <= split) {
+ memcpy(d, n, vl); /* all true */
+ } else {
+ memcpy(d, n, split * sizeof(uint64_t));
+ memcpy(d + split, m + split,
+ (elements - split) * sizeof(uint64_t));
+ }
+ }
+ }
+}