emit_insn (gen_aarch64_tpidr2_save ());
emit_insn (gen_aarch64_clear_tpidr2 ());
if (mode == aarch64_local_sme_state::ACTIVE_LIVE
- || mode == aarch64_local_sme_state::ACTIVE_DEAD)
+ || mode == aarch64_local_sme_state::ACTIVE_DEAD
+ || mode == aarch64_local_sme_state::INACTIVE_LOCAL)
{
if (aarch64_cfun_has_state ("za"))
emit_insn (gen_aarch64_initial_zero_za ());
if (mode == aarch64_local_sme_state::INACTIVE_LOCAL)
{
+ if (prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
+ /* Enable ZA (if it wasn't already enabled on entry). Enabling ZA has
+ the side-effect of zeroing ZA.
+
+ A functionally correct alternative would be to leave TPIDR2_EL0 null
+ and zero the save buffer. However, zeroing the save buffer would require
+ more code and would optimize for the case in which a callee also
+ initialises private ZA state (which should be a rare event). */
+ emit_insn (gen_aarch64_smstart_za ());
+
if (prev_mode == aarch64_local_sme_state::ACTIVE_LIVE
|| prev_mode == aarch64_local_sme_state::ACTIVE_DEAD
|| prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
--- /dev/null
+// { dg-options "-O -fomit-frame-pointer -fno-optimize-sibling-calls" }
+// { dg-final { check-function-bodies "**" "" } }
+
+#include <arm_sme.h>
+
+void callee_ns(); // external callee declared with no SME attributes
+__arm_streaming __arm_inout("za") void callee_s(); // external callee declared streaming, with "za" as in/out state
+
+/*
+** foo:
+** ...
+** smstart za
+** ...
+** msr tpidr2_el0, x\d+
+** ...
+*/
+__arm_locally_streaming __arm_new("za") const float * foo(const float* x) { // function under test: the pattern above expects `smstart za` ahead of the tpidr2_el0 write in its body
+ callee_ns (); // first call goes to the callee declared without SME attributes, before this function touches ZA
+ const float32_t *x_f_in = x; // plain copy of the argument; returned unchanged below
+ svzero_za(); // <arm_sme.h> intrinsic; presumably zeroes all of ZA -- TODO confirm against the ACLE spec
+ callee_s (); // second call goes to the callee declared __arm_streaming __arm_inout("za")
+ return x_f_in; // result is just the incoming pointer; NOTE(review): the test appears to care only about the emitted ZA setup sequence
+}
+
+