typedef void gen_helper_ldst_whole(TCGv_ptr, TCGv, TCGv_env, TCGv_i32);
static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
- gen_helper_ldst_whole *fn,
- DisasContext *s)
+ uint32_t log2_esz, gen_helper_ldst_whole *fn,
+ DisasContext *s, bool is_load)
{
- TCGv_ptr dest;
- TCGv base;
- TCGv_i32 desc;
-
- uint32_t data = FIELD_DP32(0, VDATA, NF, nf);
- data = FIELD_DP32(data, VDATA, VM, 1);
- dest = tcg_temp_new_ptr();
- desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
- s->cfg_ptr->vlenb, data));
-
- base = get_gpr(s, rs1, EXT_NONE);
- tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
-
mark_vs_dirty(s);
- fn(dest, base, tcg_env, desc);
+ /*
+ * Load/store multiple bytes per iteration.
+ * When possible do this atomically.
+ * Update vstart with the number of processed elements.
+ * Use the helper function if either:
+ * - vstart is not 0.
+ * - the target has 32 bit registers and we are loading/storing 64 bit long
+ * elements. This is to ensure that we process every element with a single
+ * memory instruction.
+ */
+
+ bool use_helper_fn = !(s->vstart_eq_zero) ||
+ (TCG_TARGET_REG_BITS == 32 && log2_esz == 3);
+
+ if (!use_helper_fn) {
+ TCGv addr = tcg_temp_new();
+ uint32_t size = s->cfg_ptr->vlenb * nf;
+ TCGv_i64 t8 = tcg_temp_new_i64();
+ TCGv_i32 t4 = tcg_temp_new_i32();
+ MemOp atomicity = MO_ATOM_NONE;
+ if (log2_esz == 0) {
+ atomicity = MO_ATOM_NONE;
+ } else {
+ atomicity = MO_ATOM_IFALIGN_PAIR;
+ }
+ if (TCG_TARGET_REG_BITS == 64) {
+ for (int i = 0; i < size; i += 8) {
+ addr = get_address(s, rs1, i);
+ if (is_load) {
+ tcg_gen_qemu_ld_i64(t8, addr, s->mem_idx,
+ MO_LE | MO_64 | atomicity);
+ tcg_gen_st_i64(t8, tcg_env, vreg_ofs(s, vd) + i);
+ } else {
+ tcg_gen_ld_i64(t8, tcg_env, vreg_ofs(s, vd) + i);
+ tcg_gen_qemu_st_i64(t8, addr, s->mem_idx,
+ MO_LE | MO_64 | atomicity);
+ }
+ if (i == size - 8) {
+ tcg_gen_movi_tl(cpu_vstart, 0);
+ } else {
+ tcg_gen_addi_tl(cpu_vstart, cpu_vstart, 8 >> log2_esz);
+ }
+ }
+ } else {
+ for (int i = 0; i < size; i += 4) {
+ addr = get_address(s, rs1, i);
+ if (is_load) {
+ tcg_gen_qemu_ld_i32(t4, addr, s->mem_idx,
+ MO_LE | MO_32 | atomicity);
+ tcg_gen_st_i32(t4, tcg_env, vreg_ofs(s, vd) + i);
+ } else {
+ tcg_gen_ld_i32(t4, tcg_env, vreg_ofs(s, vd) + i);
+ tcg_gen_qemu_st_i32(t4, addr, s->mem_idx,
+ MO_LE | MO_32 | atomicity);
+ }
+ if (i == size - 4) {
+ tcg_gen_movi_tl(cpu_vstart, 0);
+ } else {
+ tcg_gen_addi_tl(cpu_vstart, cpu_vstart, 4 >> log2_esz);
+ }
+ }
+ }
+ } else {
+ TCGv_ptr dest;
+ TCGv base;
+ TCGv_i32 desc;
+ uint32_t data = FIELD_DP32(0, VDATA, NF, nf);
+ data = FIELD_DP32(data, VDATA, VM, 1);
+ dest = tcg_temp_new_ptr();
+ desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
+ s->cfg_ptr->vlenb, data));
+ base = get_gpr(s, rs1, EXT_NONE);
+ tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
+ fn(dest, base, tcg_env, desc);
+ }
finalize_rvv_inst(s);
return true;
* load and store whole register instructions ignore vtype and vl setting.
* Thus, we don't need to check vill bit. (Section 7.9)
*/
-#define GEN_LDST_WHOLE_TRANS(NAME, ARG_NF) \
-static bool trans_##NAME(DisasContext *s, arg_##NAME * a) \
-{ \
- if (require_rvv(s) && \
- QEMU_IS_ALIGNED(a->rd, ARG_NF)) { \
- return ldst_whole_trans(a->rd, a->rs1, ARG_NF, \
- gen_helper_##NAME, s); \
- } \
- return false; \
-}
-
-GEN_LDST_WHOLE_TRANS(vl1re8_v, 1)
-GEN_LDST_WHOLE_TRANS(vl1re16_v, 1)
-GEN_LDST_WHOLE_TRANS(vl1re32_v, 1)
-GEN_LDST_WHOLE_TRANS(vl1re64_v, 1)
-GEN_LDST_WHOLE_TRANS(vl2re8_v, 2)
-GEN_LDST_WHOLE_TRANS(vl2re16_v, 2)
-GEN_LDST_WHOLE_TRANS(vl2re32_v, 2)
-GEN_LDST_WHOLE_TRANS(vl2re64_v, 2)
-GEN_LDST_WHOLE_TRANS(vl4re8_v, 4)
-GEN_LDST_WHOLE_TRANS(vl4re16_v, 4)
-GEN_LDST_WHOLE_TRANS(vl4re32_v, 4)
-GEN_LDST_WHOLE_TRANS(vl4re64_v, 4)
-GEN_LDST_WHOLE_TRANS(vl8re8_v, 8)
-GEN_LDST_WHOLE_TRANS(vl8re16_v, 8)
-GEN_LDST_WHOLE_TRANS(vl8re32_v, 8)
-GEN_LDST_WHOLE_TRANS(vl8re64_v, 8)
+#define GEN_LDST_WHOLE_TRANS(NAME, ETYPE, ARG_NF, IS_LOAD) \
+static bool trans_##NAME(DisasContext *s, arg_##NAME * a) \
+{ \
+ if (require_rvv(s) && \
+ QEMU_IS_ALIGNED(a->rd, ARG_NF)) { \
+ return ldst_whole_trans(a->rd, a->rs1, ARG_NF, ctzl(sizeof(ETYPE)), \
+ gen_helper_##NAME, s, IS_LOAD); \
+ } \
+ return false; \
+}
+
+GEN_LDST_WHOLE_TRANS(vl1re8_v, int8_t, 1, true)
+GEN_LDST_WHOLE_TRANS(vl1re16_v, int16_t, 1, true)
+GEN_LDST_WHOLE_TRANS(vl1re32_v, int32_t, 1, true)
+GEN_LDST_WHOLE_TRANS(vl1re64_v, int64_t, 1, true)
+GEN_LDST_WHOLE_TRANS(vl2re8_v, int8_t, 2, true)
+GEN_LDST_WHOLE_TRANS(vl2re16_v, int16_t, 2, true)
+GEN_LDST_WHOLE_TRANS(vl2re32_v, int32_t, 2, true)
+GEN_LDST_WHOLE_TRANS(vl2re64_v, int64_t, 2, true)
+GEN_LDST_WHOLE_TRANS(vl4re8_v, int8_t, 4, true)
+GEN_LDST_WHOLE_TRANS(vl4re16_v, int16_t, 4, true)
+GEN_LDST_WHOLE_TRANS(vl4re32_v, int32_t, 4, true)
+GEN_LDST_WHOLE_TRANS(vl4re64_v, int64_t, 4, true)
+GEN_LDST_WHOLE_TRANS(vl8re8_v, int8_t, 8, true)
+GEN_LDST_WHOLE_TRANS(vl8re16_v, int16_t, 8, true)
+GEN_LDST_WHOLE_TRANS(vl8re32_v, int32_t, 8, true)
+GEN_LDST_WHOLE_TRANS(vl8re64_v, int64_t, 8, true)
/*
* The vector whole register store instructions are encoded similar to
* unmasked unit-stride store of elements with EEW=8.
*/
-GEN_LDST_WHOLE_TRANS(vs1r_v, 1)
-GEN_LDST_WHOLE_TRANS(vs2r_v, 2)
-GEN_LDST_WHOLE_TRANS(vs4r_v, 4)
-GEN_LDST_WHOLE_TRANS(vs8r_v, 8)
+GEN_LDST_WHOLE_TRANS(vs1r_v, int8_t, 1, false)
+GEN_LDST_WHOLE_TRANS(vs2r_v, int8_t, 2, false)
+GEN_LDST_WHOLE_TRANS(vs4r_v, int8_t, 4, false)
+GEN_LDST_WHOLE_TRANS(vs8r_v, int8_t, 8, false)
/*
*** Vector Integer Arithmetic Instructions