4.9-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Tue, 17 Apr 2018 12:10:21 +0000 (14:10 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Tue, 17 Apr 2018 12:10:21 +0000 (14:10 +0200)
added patches:
arm-arm64-kvm-add-psci_version-helper.patch
arm-arm64-kvm-add-smccc-accessors-to-psci-code.patch
arm-arm64-kvm-advertise-smccc-v1.1.patch
arm-arm64-kvm-consolidate-the-psci-include-files.patch
arm-arm64-kvm-implement-psci-1.0-support.patch
arm-arm64-kvm-turn-kvm_psci_version-into-a-static-inline.patch
arm-arm64-smccc-implement-smccc-v1.1-inline-primitive.patch
arm-arm64-smccc-make-function-identifiers-an-unsigned-quantity.patch
arm64-add-arm_smccc_arch_workaround_1-bp-hardening-support.patch
arm64-add-skeleton-to-harden-the-branch-predictor-against-aliasing-attacks.patch
arm64-barrier-add-csdb-macros-to-control-data-value-prediction.patch
arm64-branch-predictor-hardening-for-cavium-thunderx2.patch
arm64-cpu_errata-allow-an-erratum-to-be-match-for-all-revisions-of-a-core.patch
arm64-cpufeature-__this_cpu_has_cap-shouldn-t-stop-early.patch
arm64-cpufeature-pass-capability-structure-to-enable-callback.patch
arm64-cputype-add-missing-midr-values-for-cortex-a72-and-cortex-a75.patch
arm64-entry-apply-bp-hardening-for-high-priority-synchronous-exceptions.patch
arm64-entry-apply-bp-hardening-for-suspicious-interrupts-from-el0.patch
arm64-entry-ensure-branch-through-syscall-table-is-bounded-under-speculation.patch
arm64-factor-out-ttbr0_el1-post-update-workaround-into-a-specific-asm-macro.patch
arm64-implement-array_index_mask_nospec.patch
arm64-implement-branch-predictor-hardening-for-affected-cortex-a-cpus.patch
arm64-kill-psci_get_version-as-a-variant-2-workaround.patch
arm64-kvm-add-smccc_arch_workaround_1-fast-handling.patch
arm64-kvm-increment-pc-after-handling-an-smc-trap.patch
arm64-kvm-make-psci_version-a-fast-path.patch
arm64-kvm-report-smccc_arch_workaround_1-bp-hardening-support.patch
arm64-kvm-use-per-cpu-vector-when-bp-hardening-is-enabled.patch
arm64-make-user_ds-an-inclusive-limit.patch
arm64-move-bp-hardening-to-check_and_switch_context.patch
arm64-move-post_ttbr_update_workaround-to-c-code.patch
arm64-move-task_-definitions-to-asm-processor.h.patch
arm64-run-enable-method-for-errata-work-arounds-on-late-cpus.patch
arm64-uaccess-don-t-bother-eliding-access_ok-checks-in-__-get-put-_user.patch
arm64-uaccess-mask-__user-pointers-for-__arch_-clear-copy_-_user.patch
arm64-uaccess-prevent-speculative-use-of-the-current-addr_limit.patch
arm64-use-pointer-masking-to-limit-uaccess-speculation.patch
drivers-firmware-expose-psci_get_version-through-psci_ops-structure.patch
firmware-psci-expose-psci-conduit.patch
firmware-psci-expose-smccc-version-through-psci_ops.patch
mm-introduce-lm_alias.patch

42 files changed:
queue-4.9/arm-arm64-kvm-add-psci_version-helper.patch [new file with mode: 0644]
queue-4.9/arm-arm64-kvm-add-smccc-accessors-to-psci-code.patch [new file with mode: 0644]
queue-4.9/arm-arm64-kvm-advertise-smccc-v1.1.patch [new file with mode: 0644]
queue-4.9/arm-arm64-kvm-consolidate-the-psci-include-files.patch [new file with mode: 0644]
queue-4.9/arm-arm64-kvm-implement-psci-1.0-support.patch [new file with mode: 0644]
queue-4.9/arm-arm64-kvm-turn-kvm_psci_version-into-a-static-inline.patch [new file with mode: 0644]
queue-4.9/arm-arm64-smccc-implement-smccc-v1.1-inline-primitive.patch [new file with mode: 0644]
queue-4.9/arm-arm64-smccc-make-function-identifiers-an-unsigned-quantity.patch [new file with mode: 0644]
queue-4.9/arm64-add-arm_smccc_arch_workaround_1-bp-hardening-support.patch [new file with mode: 0644]
queue-4.9/arm64-add-skeleton-to-harden-the-branch-predictor-against-aliasing-attacks.patch [new file with mode: 0644]
queue-4.9/arm64-barrier-add-csdb-macros-to-control-data-value-prediction.patch [new file with mode: 0644]
queue-4.9/arm64-branch-predictor-hardening-for-cavium-thunderx2.patch [new file with mode: 0644]
queue-4.9/arm64-cpu_errata-allow-an-erratum-to-be-match-for-all-revisions-of-a-core.patch [new file with mode: 0644]
queue-4.9/arm64-cpufeature-__this_cpu_has_cap-shouldn-t-stop-early.patch [new file with mode: 0644]
queue-4.9/arm64-cpufeature-pass-capability-structure-to-enable-callback.patch [new file with mode: 0644]
queue-4.9/arm64-cputype-add-missing-midr-values-for-cortex-a72-and-cortex-a75.patch [new file with mode: 0644]
queue-4.9/arm64-entry-apply-bp-hardening-for-high-priority-synchronous-exceptions.patch [new file with mode: 0644]
queue-4.9/arm64-entry-apply-bp-hardening-for-suspicious-interrupts-from-el0.patch [new file with mode: 0644]
queue-4.9/arm64-entry-ensure-branch-through-syscall-table-is-bounded-under-speculation.patch [new file with mode: 0644]
queue-4.9/arm64-factor-out-ttbr0_el1-post-update-workaround-into-a-specific-asm-macro.patch [new file with mode: 0644]
queue-4.9/arm64-implement-array_index_mask_nospec.patch [new file with mode: 0644]
queue-4.9/arm64-implement-branch-predictor-hardening-for-affected-cortex-a-cpus.patch [new file with mode: 0644]
queue-4.9/arm64-kill-psci_get_version-as-a-variant-2-workaround.patch [new file with mode: 0644]
queue-4.9/arm64-kvm-add-smccc_arch_workaround_1-fast-handling.patch [new file with mode: 0644]
queue-4.9/arm64-kvm-increment-pc-after-handling-an-smc-trap.patch [new file with mode: 0644]
queue-4.9/arm64-kvm-make-psci_version-a-fast-path.patch [new file with mode: 0644]
queue-4.9/arm64-kvm-report-smccc_arch_workaround_1-bp-hardening-support.patch [new file with mode: 0644]
queue-4.9/arm64-kvm-use-per-cpu-vector-when-bp-hardening-is-enabled.patch [new file with mode: 0644]
queue-4.9/arm64-make-user_ds-an-inclusive-limit.patch [new file with mode: 0644]
queue-4.9/arm64-move-bp-hardening-to-check_and_switch_context.patch [new file with mode: 0644]
queue-4.9/arm64-move-post_ttbr_update_workaround-to-c-code.patch [new file with mode: 0644]
queue-4.9/arm64-move-task_-definitions-to-asm-processor.h.patch [new file with mode: 0644]
queue-4.9/arm64-run-enable-method-for-errata-work-arounds-on-late-cpus.patch [new file with mode: 0644]
queue-4.9/arm64-uaccess-don-t-bother-eliding-access_ok-checks-in-__-get-put-_user.patch [new file with mode: 0644]
queue-4.9/arm64-uaccess-mask-__user-pointers-for-__arch_-clear-copy_-_user.patch [new file with mode: 0644]
queue-4.9/arm64-uaccess-prevent-speculative-use-of-the-current-addr_limit.patch [new file with mode: 0644]
queue-4.9/arm64-use-pointer-masking-to-limit-uaccess-speculation.patch [new file with mode: 0644]
queue-4.9/drivers-firmware-expose-psci_get_version-through-psci_ops-structure.patch [new file with mode: 0644]
queue-4.9/firmware-psci-expose-psci-conduit.patch [new file with mode: 0644]
queue-4.9/firmware-psci-expose-smccc-version-through-psci_ops.patch [new file with mode: 0644]
queue-4.9/mm-introduce-lm_alias.patch [new file with mode: 0644]
queue-4.9/series

diff --git a/queue-4.9/arm-arm64-kvm-add-psci_version-helper.patch b/queue-4.9/arm-arm64-kvm-add-psci_version-helper.patch
new file mode 100644 (file)
index 0000000..895a397
--- /dev/null
@@ -0,0 +1,80 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:25 +0100
+Subject: [PATCH v4.9.y 29/42] arm/arm64: KVM: Add PSCI_VERSION helper
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-30-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit d0a144f12a7ca8368933eae6583c096c363ec506 upstream.
+
+As we're about to trigger a PSCI version explosion, it doesn't
+hurt to introduce a PSCI_VERSION helper that is going to be
+used everywhere.
+
+Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
+Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+[v4.9: account for files moved to virt/ upstream]
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm/kvm/psci.c       |    4 +---
+ include/kvm/arm_psci.h    |    6 ++++--
+ include/uapi/linux/psci.h |    3 +++
+ 3 files changed, 8 insertions(+), 5 deletions(-)
+
+--- a/arch/arm/kvm/psci.c
++++ b/arch/arm/kvm/psci.c
+@@ -25,8 +25,6 @@
+ #include <kvm/arm_psci.h>
+-#include <uapi/linux/psci.h>
+-
+ /*
+  * This is an implementation of the Power State Coordination Interface
+  * as described in ARM document number ARM DEN 0022A.
+@@ -220,7 +218,7 @@ static int kvm_psci_0_2_call(struct kvm_
+                * Bits[31:16] = Major Version = 0
+                * Bits[15:0] = Minor Version = 2
+                */
+-              val = 2;
++              val = KVM_ARM_PSCI_0_2;
+               break;
+       case PSCI_0_2_FN_CPU_SUSPEND:
+       case PSCI_0_2_FN64_CPU_SUSPEND:
+--- a/include/kvm/arm_psci.h
++++ b/include/kvm/arm_psci.h
+@@ -18,8 +18,10 @@
+ #ifndef __KVM_ARM_PSCI_H__
+ #define __KVM_ARM_PSCI_H__
+-#define KVM_ARM_PSCI_0_1      1
+-#define KVM_ARM_PSCI_0_2      2
++#include <uapi/linux/psci.h>
++
++#define KVM_ARM_PSCI_0_1      PSCI_VERSION(0, 1)
++#define KVM_ARM_PSCI_0_2      PSCI_VERSION(0, 2)
+ int kvm_psci_version(struct kvm_vcpu *vcpu);
+ int kvm_psci_call(struct kvm_vcpu *vcpu);
+--- a/include/uapi/linux/psci.h
++++ b/include/uapi/linux/psci.h
+@@ -87,6 +87,9 @@
+               (((ver) & PSCI_VERSION_MAJOR_MASK) >> PSCI_VERSION_MAJOR_SHIFT)
+ #define PSCI_VERSION_MINOR(ver)                       \
+               ((ver) & PSCI_VERSION_MINOR_MASK)
++#define PSCI_VERSION(maj, min)                                                \
++      ((((maj) << PSCI_VERSION_MAJOR_SHIFT) & PSCI_VERSION_MAJOR_MASK) | \
++       ((min) & PSCI_VERSION_MINOR_MASK))
+ /* PSCI features decoding (>=1.0) */
+ #define PSCI_1_0_FEATURES_CPU_SUSPEND_PF_SHIFT        1
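As a quick illustration of the packing the PSCI_VERSION() helper above introduces, here is a standalone sketch (not part of the patch); the shift/mask values mirror the 16-bit major/minor split in include/uapi/linux/psci.h and should be treated as assumptions of this sketch:

#include <stdio.h>

/* Major version in bits [31:16], minor version in bits [15:0]. */
#define PSCI_VERSION_MAJOR_SHIFT	16
#define PSCI_VERSION_MINOR_MASK		((1U << PSCI_VERSION_MAJOR_SHIFT) - 1)
#define PSCI_VERSION_MAJOR_MASK		(~PSCI_VERSION_MINOR_MASK)

#define PSCI_VERSION(maj, min)						\
	((((maj) << PSCI_VERSION_MAJOR_SHIFT) & PSCI_VERSION_MAJOR_MASK) | \
	 ((min) & PSCI_VERSION_MINOR_MASK))

int main(void)
{
	/* KVM_ARM_PSCI_0_2 becomes 0x00000002; a 1.0 version packs as 0x00010000 */
	printf("PSCI 0.2 -> 0x%08x\n", PSCI_VERSION(0, 2));
	printf("PSCI 1.0 -> 0x%08x\n", PSCI_VERSION(1, 0));
	return 0;
}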
diff --git a/queue-4.9/arm-arm64-kvm-add-smccc-accessors-to-psci-code.patch b/queue-4.9/arm-arm64-kvm-add-smccc-accessors-to-psci-code.patch
new file mode 100644 (file)
index 0000000..4007079
--- /dev/null
@@ -0,0 +1,146 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:26 +0100
+Subject: [PATCH v4.9.y 30/42] arm/arm64: KVM: Add smccc accessors to PSCI code
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-31-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit 84684fecd7ea381824a96634a027b7719587fb77 upstream.
+
+Instead of open coding the accesses to the various registers,
+let's add explicit SMCCC accessors.
+
+Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
+Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+[v4.9: account for files moved to virt/ upstream]
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm/kvm/psci.c |   52 ++++++++++++++++++++++++++++++++++++++++++----------
+ 1 file changed, 42 insertions(+), 10 deletions(-)
+
+--- a/arch/arm/kvm/psci.c
++++ b/arch/arm/kvm/psci.c
+@@ -32,6 +32,38 @@
+ #define AFFINITY_MASK(level)  ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1)
++static u32 smccc_get_function(struct kvm_vcpu *vcpu)
++{
++      return vcpu_get_reg(vcpu, 0);
++}
++
++static unsigned long smccc_get_arg1(struct kvm_vcpu *vcpu)
++{
++      return vcpu_get_reg(vcpu, 1);
++}
++
++static unsigned long smccc_get_arg2(struct kvm_vcpu *vcpu)
++{
++      return vcpu_get_reg(vcpu, 2);
++}
++
++static unsigned long smccc_get_arg3(struct kvm_vcpu *vcpu)
++{
++      return vcpu_get_reg(vcpu, 3);
++}
++
++static void smccc_set_retval(struct kvm_vcpu *vcpu,
++                           unsigned long a0,
++                           unsigned long a1,
++                           unsigned long a2,
++                           unsigned long a3)
++{
++      vcpu_set_reg(vcpu, 0, a0);
++      vcpu_set_reg(vcpu, 1, a1);
++      vcpu_set_reg(vcpu, 2, a2);
++      vcpu_set_reg(vcpu, 3, a3);
++}
++
+ static unsigned long psci_affinity_mask(unsigned long affinity_level)
+ {
+       if (affinity_level <= 3)
+@@ -74,7 +106,7 @@ static unsigned long kvm_psci_vcpu_on(st
+       unsigned long context_id;
+       phys_addr_t target_pc;
+-      cpu_id = vcpu_get_reg(source_vcpu, 1) & MPIDR_HWID_BITMASK;
++      cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK;
+       if (vcpu_mode_is_32bit(source_vcpu))
+               cpu_id &= ~((u32) 0);
+@@ -93,8 +125,8 @@ static unsigned long kvm_psci_vcpu_on(st
+                       return PSCI_RET_INVALID_PARAMS;
+       }
+-      target_pc = vcpu_get_reg(source_vcpu, 2);
+-      context_id = vcpu_get_reg(source_vcpu, 3);
++      target_pc = smccc_get_arg2(source_vcpu);
++      context_id = smccc_get_arg3(source_vcpu);
+       kvm_reset_vcpu(vcpu);
+@@ -113,7 +145,7 @@ static unsigned long kvm_psci_vcpu_on(st
+        * NOTE: We always update r0 (or x0) because for PSCI v0.1
+        * the general puspose registers are undefined upon CPU_ON.
+        */
+-      vcpu_set_reg(vcpu, 0, context_id);
++      smccc_set_retval(vcpu, context_id, 0, 0, 0);
+       vcpu->arch.power_off = false;
+       smp_mb();               /* Make sure the above is visible */
+@@ -133,8 +165,8 @@ static unsigned long kvm_psci_vcpu_affin
+       struct kvm *kvm = vcpu->kvm;
+       struct kvm_vcpu *tmp;
+-      target_affinity = vcpu_get_reg(vcpu, 1);
+-      lowest_affinity_level = vcpu_get_reg(vcpu, 2);
++      target_affinity = smccc_get_arg1(vcpu);
++      lowest_affinity_level = smccc_get_arg2(vcpu);
+       /* Determine target affinity mask */
+       target_affinity_mask = psci_affinity_mask(lowest_affinity_level);
+@@ -208,7 +240,7 @@ int kvm_psci_version(struct kvm_vcpu *vc
+ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
+ {
+       struct kvm *kvm = vcpu->kvm;
+-      unsigned long psci_fn = vcpu_get_reg(vcpu, 0) & ~((u32) 0);
++      unsigned long psci_fn = smccc_get_function(vcpu);
+       unsigned long val;
+       int ret = 1;
+@@ -275,14 +307,14 @@ static int kvm_psci_0_2_call(struct kvm_
+               break;
+       }
+-      vcpu_set_reg(vcpu, 0, val);
++      smccc_set_retval(vcpu, val, 0, 0, 0);
+       return ret;
+ }
+ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
+ {
+       struct kvm *kvm = vcpu->kvm;
+-      unsigned long psci_fn = vcpu_get_reg(vcpu, 0) & ~((u32) 0);
++      unsigned long psci_fn = smccc_get_function(vcpu);
+       unsigned long val;
+       switch (psci_fn) {
+@@ -300,7 +332,7 @@ static int kvm_psci_0_1_call(struct kvm_
+               break;
+       }
+-      vcpu_set_reg(vcpu, 0, val);
++      smccc_set_retval(vcpu, val, 0, 0, 0);
+       return 1;
+ }
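A hypothetical handler fragment (not part of the patch, and it would only build inside arch/arm/kvm/psci.c where the static accessors live) showing the convention the new helpers encode: the SMCCC/PSCI function ID arrives in r0/x0, arguments in r1-r3, and up to four result values go back in r0-r3:

static int kvm_handle_example_call(struct kvm_vcpu *vcpu)
{
	u32 func_id = smccc_get_function(vcpu);	/* r0/x0: function ID */
	unsigned long val = PSCI_RET_NOT_SUPPORTED;

	if (func_id == PSCI_0_2_FN_PSCI_VERSION)
		val = KVM_ARM_PSCI_0_2;

	/* one call sets all four result registers instead of four
	 * open-coded vcpu_set_reg() calls */
	smccc_set_retval(vcpu, val, 0, 0, 0);
	return 1;
}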
diff --git a/queue-4.9/arm-arm64-kvm-advertise-smccc-v1.1.patch b/queue-4.9/arm-arm64-kvm-advertise-smccc-v1.1.patch
new file mode 100644 (file)
index 0000000..dc9385f
--- /dev/null
@@ -0,0 +1,142 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:28 +0100
+Subject: [PATCH v4.9.y 32/42] arm/arm64: KVM: Advertise SMCCC v1.1
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-33-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit 09e6be12effdb33bf7210c8867bbd213b66a499e upstream.
+
+The new SMC Calling Convention (v1.1) allows for a reduced overhead
+when calling into the firmware, and provides a new feature discovery
+mechanism.
+
+Make it visible to KVM guests.
+
+Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+[v4.9: account for files moved to virt/ upstream]
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm/kvm/handle_exit.c   |    2 +-
+ arch/arm/kvm/psci.c          |   24 +++++++++++++++++++++++-
+ arch/arm64/kvm/handle_exit.c |    2 +-
+ include/kvm/arm_psci.h       |    2 +-
+ include/linux/arm-smccc.h    |   13 +++++++++++++
+ 5 files changed, 39 insertions(+), 4 deletions(-)
+
+--- a/arch/arm/kvm/handle_exit.c
++++ b/arch/arm/kvm/handle_exit.c
+@@ -36,7 +36,7 @@ static int handle_hvc(struct kvm_vcpu *v
+                     kvm_vcpu_hvc_get_imm(vcpu));
+       vcpu->stat.hvc_exit_stat++;
+-      ret = kvm_psci_call(vcpu);
++      ret = kvm_hvc_call_handler(vcpu);
+       if (ret < 0) {
+               vcpu_set_reg(vcpu, 0, ~0UL);
+               return 1;
+--- a/arch/arm/kvm/psci.c
++++ b/arch/arm/kvm/psci.c
+@@ -15,6 +15,7 @@
+  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  */
++#include <linux/arm-smccc.h>
+ #include <linux/preempt.h>
+ #include <linux/kvm_host.h>
+ #include <linux/wait.h>
+@@ -337,6 +338,7 @@ static int kvm_psci_1_0_call(struct kvm_
+               case PSCI_0_2_FN_SYSTEM_OFF:
+               case PSCI_0_2_FN_SYSTEM_RESET:
+               case PSCI_1_0_FN_PSCI_FEATURES:
++              case ARM_SMCCC_VERSION_FUNC_ID:
+                       val = 0;
+                       break;
+               default:
+@@ -391,7 +393,7 @@ static int kvm_psci_0_1_call(struct kvm_
+  * Errors:
+  * -EINVAL: Unrecognized PSCI function
+  */
+-int kvm_psci_call(struct kvm_vcpu *vcpu)
++static int kvm_psci_call(struct kvm_vcpu *vcpu)
+ {
+       switch (kvm_psci_version(vcpu)) {
+       case KVM_ARM_PSCI_1_0:
+@@ -404,3 +406,23 @@ int kvm_psci_call(struct kvm_vcpu *vcpu)
+               return -EINVAL;
+       };
+ }
++
++int kvm_hvc_call_handler(struct kvm_vcpu *vcpu)
++{
++      u32 func_id = smccc_get_function(vcpu);
++      u32 val = PSCI_RET_NOT_SUPPORTED;
++
++      switch (func_id) {
++      case ARM_SMCCC_VERSION_FUNC_ID:
++              val = ARM_SMCCC_VERSION_1_1;
++              break;
++      case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
++              /* Nothing supported yet */
++              break;
++      default:
++              return kvm_psci_call(vcpu);
++      }
++
++      smccc_set_retval(vcpu, val, 0, 0, 0);
++      return 1;
++}
+--- a/arch/arm64/kvm/handle_exit.c
++++ b/arch/arm64/kvm/handle_exit.c
+@@ -45,7 +45,7 @@ static int handle_hvc(struct kvm_vcpu *v
+                           kvm_vcpu_hvc_get_imm(vcpu));
+       vcpu->stat.hvc_exit_stat++;
+-      ret = kvm_psci_call(vcpu);
++      ret = kvm_hvc_call_handler(vcpu);
+       if (ret < 0) {
+               vcpu_set_reg(vcpu, 0, ~0UL);
+               return 1;
+--- a/include/kvm/arm_psci.h
++++ b/include/kvm/arm_psci.h
+@@ -27,6 +27,6 @@
+ #define KVM_ARM_PSCI_LATEST   KVM_ARM_PSCI_1_0
+ int kvm_psci_version(struct kvm_vcpu *vcpu);
+-int kvm_psci_call(struct kvm_vcpu *vcpu);
++int kvm_hvc_call_handler(struct kvm_vcpu *vcpu);
+ #endif /* __KVM_ARM_PSCI_H__ */
+--- a/include/linux/arm-smccc.h
++++ b/include/linux/arm-smccc.h
+@@ -60,6 +60,19 @@
+ #define ARM_SMCCC_QUIRK_NONE          0
+ #define ARM_SMCCC_QUIRK_QCOM_A6               1 /* Save/restore register a6 */
++#define ARM_SMCCC_VERSION_1_0         0x10000
++#define ARM_SMCCC_VERSION_1_1         0x10001
++
++#define ARM_SMCCC_VERSION_FUNC_ID                                     \
++      ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
++                         ARM_SMCCC_SMC_32,                            \
++                         0, 0)
++
++#define ARM_SMCCC_ARCH_FEATURES_FUNC_ID                                       \
++      ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
++                         ARM_SMCCC_SMC_32,                            \
++                         0, 1)
++
+ #ifndef __ASSEMBLY__
+ #include <linux/linkage.h>
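For context, a hedged sketch of what this buys a guest: once KVM answers ARM_SMCCC_VERSION_FUNC_ID, the guest can detect SMCCC v1.1 with a single HVC. This is a simplified illustration using the pre-existing arm_smccc_hvc() helper, not the exact probing logic in drivers/firmware/psci.c:

#include <linux/arm-smccc.h>

static bool guest_sees_smccc_1_1(void)
{
	struct arm_smccc_res res;

	arm_smccc_hvc(ARM_SMCCC_VERSION_FUNC_ID, 0, 0, 0, 0, 0, 0, 0, &res);
	if ((long)res.a0 < 0)		/* e.g. PSCI_RET_NOT_SUPPORTED */
		return false;
	return res.a0 >= ARM_SMCCC_VERSION_1_1;
}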
diff --git a/queue-4.9/arm-arm64-kvm-consolidate-the-psci-include-files.patch b/queue-4.9/arm-arm64-kvm-consolidate-the-psci-include-files.patch
new file mode 100644 (file)
index 0000000..854ddd9
--- /dev/null
@@ -0,0 +1,191 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:24 +0100
+Subject: [PATCH v4.9.y 28/42] arm/arm64: KVM: Consolidate the PSCI include files
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-29-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit 1a2fb94e6a771ff94f4afa22497a4695187b820c upstream.
+
+As we're about to update the PSCI support, and because I'm lazy,
+let's move the PSCI include file to include/kvm so that both
+ARM architectures can find it.
+
+Acked-by: Christoffer Dall <christoffer.dall@linaro.org>
+Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+[v4.9: account for files moved to virt/ upstream]
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm/include/asm/kvm_psci.h   |   27 ---------------------------
+ arch/arm/kvm/arm.c                |    2 +-
+ arch/arm/kvm/handle_exit.c        |    2 +-
+ arch/arm/kvm/psci.c               |    3 ++-
+ arch/arm64/include/asm/kvm_psci.h |   27 ---------------------------
+ arch/arm64/kvm/handle_exit.c      |    5 ++++-
+ include/kvm/arm_psci.h            |   27 +++++++++++++++++++++++++++
+ 7 files changed, 35 insertions(+), 58 deletions(-)
+ delete mode 100644 arch/arm/include/asm/kvm_psci.h
+ rename arch/arm64/include/asm/kvm_psci.h => include/kvm/arm_psci.h (89%)
+
+--- a/arch/arm/include/asm/kvm_psci.h
++++ /dev/null
+@@ -1,27 +0,0 @@
+-/*
+- * Copyright (C) 2012 - ARM Ltd
+- * Author: Marc Zyngier <marc.zyngier@arm.com>
+- *
+- * This program is free software; you can redistribute it and/or modify
+- * it under the terms of the GNU General Public License version 2 as
+- * published by the Free Software Foundation.
+- *
+- * This program is distributed in the hope that it will be useful,
+- * but WITHOUT ANY WARRANTY; without even the implied warranty of
+- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+- * GNU General Public License for more details.
+- *
+- * You should have received a copy of the GNU General Public License
+- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+- */
+-
+-#ifndef __ARM_KVM_PSCI_H__
+-#define __ARM_KVM_PSCI_H__
+-
+-#define KVM_ARM_PSCI_0_1      1
+-#define KVM_ARM_PSCI_0_2      2
+-
+-int kvm_psci_version(struct kvm_vcpu *vcpu);
+-int kvm_psci_call(struct kvm_vcpu *vcpu);
+-
+-#endif /* __ARM_KVM_PSCI_H__ */
+--- a/arch/arm/kvm/arm.c
++++ b/arch/arm/kvm/arm.c
+@@ -29,6 +29,7 @@
+ #include <linux/kvm.h>
+ #include <trace/events/kvm.h>
+ #include <kvm/arm_pmu.h>
++#include <kvm/arm_psci.h>
+ #define CREATE_TRACE_POINTS
+ #include "trace.h"
+@@ -44,7 +45,6 @@
+ #include <asm/kvm_mmu.h>
+ #include <asm/kvm_emulate.h>
+ #include <asm/kvm_coproc.h>
+-#include <asm/kvm_psci.h>
+ #include <asm/sections.h>
+ #ifdef REQUIRES_VIRT
+--- a/arch/arm/kvm/handle_exit.c
++++ b/arch/arm/kvm/handle_exit.c
+@@ -21,7 +21,7 @@
+ #include <asm/kvm_emulate.h>
+ #include <asm/kvm_coproc.h>
+ #include <asm/kvm_mmu.h>
+-#include <asm/kvm_psci.h>
++#include <kvm/arm_psci.h>
+ #include <trace/events/kvm.h>
+ #include "trace.h"
+--- a/arch/arm/kvm/psci.c
++++ b/arch/arm/kvm/psci.c
+@@ -21,9 +21,10 @@
+ #include <asm/cputype.h>
+ #include <asm/kvm_emulate.h>
+-#include <asm/kvm_psci.h>
+ #include <asm/kvm_host.h>
++#include <kvm/arm_psci.h>
++
+ #include <uapi/linux/psci.h>
+ /*
+--- a/arch/arm64/include/asm/kvm_psci.h
++++ /dev/null
+@@ -1,27 +0,0 @@
+-/*
+- * Copyright (C) 2012,2013 - ARM Ltd
+- * Author: Marc Zyngier <marc.zyngier@arm.com>
+- *
+- * This program is free software; you can redistribute it and/or modify
+- * it under the terms of the GNU General Public License version 2 as
+- * published by the Free Software Foundation.
+- *
+- * This program is distributed in the hope that it will be useful,
+- * but WITHOUT ANY WARRANTY; without even the implied warranty of
+- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+- * GNU General Public License for more details.
+- *
+- * You should have received a copy of the GNU General Public License
+- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+- */
+-
+-#ifndef __ARM64_KVM_PSCI_H__
+-#define __ARM64_KVM_PSCI_H__
+-
+-#define KVM_ARM_PSCI_0_1      1
+-#define KVM_ARM_PSCI_0_2      2
+-
+-int kvm_psci_version(struct kvm_vcpu *vcpu);
+-int kvm_psci_call(struct kvm_vcpu *vcpu);
+-
+-#endif /* __ARM64_KVM_PSCI_H__ */
+--- a/arch/arm64/kvm/handle_exit.c
++++ b/arch/arm64/kvm/handle_exit.c
+@@ -22,12 +22,15 @@
+ #include <linux/kvm.h>
+ #include <linux/kvm_host.h>
++#include <kvm/arm_psci.h>
++
+ #include <asm/esr.h>
+ #include <asm/kvm_asm.h>
+ #include <asm/kvm_coproc.h>
+ #include <asm/kvm_emulate.h>
+ #include <asm/kvm_mmu.h>
+-#include <asm/kvm_psci.h>
++#include <asm/debug-monitors.h>
++#include <asm/traps.h>
+ #define CREATE_TRACE_POINTS
+ #include "trace.h"
+--- /dev/null
++++ b/include/kvm/arm_psci.h
+@@ -0,0 +1,27 @@
++/*
++ * Copyright (C) 2012,2013 - ARM Ltd
++ * Author: Marc Zyngier <marc.zyngier@arm.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
++ */
++
++#ifndef __KVM_ARM_PSCI_H__
++#define __KVM_ARM_PSCI_H__
++
++#define KVM_ARM_PSCI_0_1      1
++#define KVM_ARM_PSCI_0_2      2
++
++int kvm_psci_version(struct kvm_vcpu *vcpu);
++int kvm_psci_call(struct kvm_vcpu *vcpu);
++
++#endif /* __KVM_ARM_PSCI_H__ */
diff --git a/queue-4.9/arm-arm64-kvm-implement-psci-1.0-support.patch b/queue-4.9/arm-arm64-kvm-implement-psci-1.0-support.patch
new file mode 100644 (file)
index 0000000..9fb95b2
--- /dev/null
@@ -0,0 +1,117 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:27 +0100
+Subject: [PATCH v4.9.y 31/42] arm/arm64: KVM: Implement PSCI 1.0 support
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-32-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit 58e0b2239a4d997094ba63986ef4de29ddc91d87 upstream.
+
+PSCI 1.0 can be trivially implemented by providing the FEATURES
+call on top of PSCI 0.2 and returning 1.0 as the PSCI version.
+
+We happily ignore everything else, as they are either optional or
+are clarifications that do not require any additional change.
+
+PSCI 1.0 is now the default until we decide to add a userspace
+selection API.
+
+Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
+Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+[v4.9: account for files moved to virt/ upstream]
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm/kvm/psci.c    |   45 ++++++++++++++++++++++++++++++++++++++++++++-
+ include/kvm/arm_psci.h |    3 +++
+ 2 files changed, 47 insertions(+), 1 deletion(-)
+
+--- a/arch/arm/kvm/psci.c
++++ b/arch/arm/kvm/psci.c
+@@ -232,7 +232,7 @@ static void kvm_psci_system_reset(struct
+ int kvm_psci_version(struct kvm_vcpu *vcpu)
+ {
+       if (test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features))
+-              return KVM_ARM_PSCI_0_2;
++              return KVM_ARM_PSCI_LATEST;
+       return KVM_ARM_PSCI_0_1;
+ }
+@@ -311,6 +311,47 @@ static int kvm_psci_0_2_call(struct kvm_
+       return ret;
+ }
++static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu)
++{
++      u32 psci_fn = smccc_get_function(vcpu);
++      u32 feature;
++      unsigned long val;
++      int ret = 1;
++
++      switch(psci_fn) {
++      case PSCI_0_2_FN_PSCI_VERSION:
++              val = KVM_ARM_PSCI_1_0;
++              break;
++      case PSCI_1_0_FN_PSCI_FEATURES:
++              feature = smccc_get_arg1(vcpu);
++              switch(feature) {
++              case PSCI_0_2_FN_PSCI_VERSION:
++              case PSCI_0_2_FN_CPU_SUSPEND:
++              case PSCI_0_2_FN64_CPU_SUSPEND:
++              case PSCI_0_2_FN_CPU_OFF:
++              case PSCI_0_2_FN_CPU_ON:
++              case PSCI_0_2_FN64_CPU_ON:
++              case PSCI_0_2_FN_AFFINITY_INFO:
++              case PSCI_0_2_FN64_AFFINITY_INFO:
++              case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
++              case PSCI_0_2_FN_SYSTEM_OFF:
++              case PSCI_0_2_FN_SYSTEM_RESET:
++              case PSCI_1_0_FN_PSCI_FEATURES:
++                      val = 0;
++                      break;
++              default:
++                      val = PSCI_RET_NOT_SUPPORTED;
++                      break;
++              }
++              break;
++      default:
++              return kvm_psci_0_2_call(vcpu);
++      }
++
++      smccc_set_retval(vcpu, val, 0, 0, 0);
++      return ret;
++}
++
+ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
+ {
+       struct kvm *kvm = vcpu->kvm;
+@@ -353,6 +394,8 @@ static int kvm_psci_0_1_call(struct kvm_
+ int kvm_psci_call(struct kvm_vcpu *vcpu)
+ {
+       switch (kvm_psci_version(vcpu)) {
++      case KVM_ARM_PSCI_1_0:
++              return kvm_psci_1_0_call(vcpu);
+       case KVM_ARM_PSCI_0_2:
+               return kvm_psci_0_2_call(vcpu);
+       case KVM_ARM_PSCI_0_1:
+--- a/include/kvm/arm_psci.h
++++ b/include/kvm/arm_psci.h
+@@ -22,6 +22,9 @@
+ #define KVM_ARM_PSCI_0_1      PSCI_VERSION(0, 1)
+ #define KVM_ARM_PSCI_0_2      PSCI_VERSION(0, 2)
++#define KVM_ARM_PSCI_1_0      PSCI_VERSION(1, 0)
++
++#define KVM_ARM_PSCI_LATEST   KVM_ARM_PSCI_1_0
+ int kvm_psci_version(struct kvm_vcpu *vcpu);
+ int kvm_psci_call(struct kvm_vcpu *vcpu);
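A short sketch of the feature discovery this enables from the guest side; invoke_psci_fn() here is a stand-in for the guest's PSCI conduit (SMC or HVC), not something defined by this patch:

#include <uapi/linux/psci.h>

/* Returns true if the (virtual) firmware implements @psci_func_id. */
static bool guest_psci_has(u32 psci_func_id)
{
	long ret = invoke_psci_fn(PSCI_1_0_FN_PSCI_FEATURES, psci_func_id, 0, 0);

	return ret >= 0;	/* PSCI_RET_NOT_SUPPORTED (-1) means "no" */
}

With the hunk above in place, guest_psci_has(PSCI_0_2_FN64_CPU_SUSPEND) succeeds, while an unknown ID falls through to the default: case and gets PSCI_RET_NOT_SUPPORTED.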
diff --git a/queue-4.9/arm-arm64-kvm-turn-kvm_psci_version-into-a-static-inline.patch b/queue-4.9/arm-arm64-kvm-turn-kvm_psci_version-into-a-static-inline.patch
new file mode 100644 (file)
index 0000000..4dbf58f
--- /dev/null
@@ -0,0 +1,140 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:30 +0100
+Subject: [PATCH v4.9.y 34/42] arm/arm64: KVM: Turn kvm_psci_version into a static inline
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-35-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit a4097b351118e821841941a79ec77d3ce3f1c5d9 upstream.
+
+We're about to need kvm_psci_version in HYP too. So let's turn it
+into a static inline, and pass the kvm structure as a second
+parameter (so that HYP can do a kern_hyp_va on it).
+
+Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+[v4.9: account for files moved to virt/ upstream]
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm/kvm/psci.c         |   12 ++----------
+ arch/arm64/kvm/hyp/switch.c |   18 +++++++++++-------
+ include/kvm/arm_psci.h      |   21 ++++++++++++++++++++-
+ 3 files changed, 33 insertions(+), 18 deletions(-)
+
+--- a/arch/arm/kvm/psci.c
++++ b/arch/arm/kvm/psci.c
+@@ -120,7 +120,7 @@ static unsigned long kvm_psci_vcpu_on(st
+       if (!vcpu)
+               return PSCI_RET_INVALID_PARAMS;
+       if (!vcpu->arch.power_off) {
+-              if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1)
++              if (kvm_psci_version(source_vcpu, kvm) != KVM_ARM_PSCI_0_1)
+                       return PSCI_RET_ALREADY_ON;
+               else
+                       return PSCI_RET_INVALID_PARAMS;
+@@ -230,14 +230,6 @@ static void kvm_psci_system_reset(struct
+       kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET);
+ }
+-int kvm_psci_version(struct kvm_vcpu *vcpu)
+-{
+-      if (test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features))
+-              return KVM_ARM_PSCI_LATEST;
+-
+-      return KVM_ARM_PSCI_0_1;
+-}
+-
+ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
+ {
+       struct kvm *kvm = vcpu->kvm;
+@@ -395,7 +387,7 @@ static int kvm_psci_0_1_call(struct kvm_
+  */
+ static int kvm_psci_call(struct kvm_vcpu *vcpu)
+ {
+-      switch (kvm_psci_version(vcpu)) {
++      switch (kvm_psci_version(vcpu, vcpu->kvm)) {
+       case KVM_ARM_PSCI_1_0:
+               return kvm_psci_1_0_call(vcpu);
+       case KVM_ARM_PSCI_0_2:
+--- a/arch/arm64/kvm/hyp/switch.c
++++ b/arch/arm64/kvm/hyp/switch.c
+@@ -19,6 +19,8 @@
+ #include <linux/jump_label.h>
+ #include <uapi/linux/psci.h>
++#include <kvm/arm_psci.h>
++
+ #include <asm/kvm_asm.h>
+ #include <asm/kvm_emulate.h>
+ #include <asm/kvm_hyp.h>
+@@ -311,14 +313,16 @@ again:
+       if (exit_code == ARM_EXCEPTION_TRAP &&
+           (kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_HVC64 ||
+-           kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_HVC32) &&
+-          vcpu_get_reg(vcpu, 0) == PSCI_0_2_FN_PSCI_VERSION) {
+-              u64 val = PSCI_RET_NOT_SUPPORTED;
+-              if (test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features))
+-                      val = 2;
++           kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_HVC32)) {
++              u32 val = vcpu_get_reg(vcpu, 0);
+-              vcpu_set_reg(vcpu, 0, val);
+-              goto again;
++              if (val == PSCI_0_2_FN_PSCI_VERSION) {
++                      val = kvm_psci_version(vcpu, kern_hyp_va(vcpu->kvm));
++                      if (unlikely(val == KVM_ARM_PSCI_0_1))
++                              val = PSCI_RET_NOT_SUPPORTED;
++                      vcpu_set_reg(vcpu, 0, val);
++                      goto again;
++              }
+       }
+       if (static_branch_unlikely(&vgic_v2_cpuif_trap) &&
+--- a/include/kvm/arm_psci.h
++++ b/include/kvm/arm_psci.h
+@@ -18,6 +18,7 @@
+ #ifndef __KVM_ARM_PSCI_H__
+ #define __KVM_ARM_PSCI_H__
++#include <linux/kvm_host.h>
+ #include <uapi/linux/psci.h>
+ #define KVM_ARM_PSCI_0_1      PSCI_VERSION(0, 1)
+@@ -26,7 +27,25 @@
+ #define KVM_ARM_PSCI_LATEST   KVM_ARM_PSCI_1_0
+-int kvm_psci_version(struct kvm_vcpu *vcpu);
++/*
++ * We need the KVM pointer independently from the vcpu as we can call
++ * this from HYP, and need to apply kern_hyp_va on it...
++ */
++static inline int kvm_psci_version(struct kvm_vcpu *vcpu, struct kvm *kvm)
++{
++      /*
++       * Our PSCI implementation stays the same across versions from
++       * v0.2 onward, only adding the few mandatory functions (such
++       * as FEATURES with 1.0) that are required by newer
++       * revisions. It is thus safe to return the latest.
++       */
++      if (test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features))
++              return KVM_ARM_PSCI_LATEST;
++
++      return KVM_ARM_PSCI_0_1;
++}
++
++
+ int kvm_hvc_call_handler(struct kvm_vcpu *vcpu);
+ #endif /* __KVM_ARM_PSCI_H__ */
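In short, the second parameter exists because EL2 code cannot dereference a kernel virtual address directly. A hypothetical HYP-side caller (condensed from the switch.c hunk above, not new code in the patch) would look like this, whereas normal kernel context simply passes vcpu->kvm:

static int example_version_from_hyp(struct kvm_vcpu *vcpu)
{
	/* translate the kernel VA of vcpu->kvm into a HYP VA first */
	return kvm_psci_version(vcpu, kern_hyp_va(vcpu->kvm));
}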
diff --git a/queue-4.9/arm-arm64-smccc-implement-smccc-v1.1-inline-primitive.patch b/queue-4.9/arm-arm64-smccc-implement-smccc-v1.1-inline-primitive.patch
new file mode 100644 (file)
index 0000000..e63bc8c
--- /dev/null
@@ -0,0 +1,181 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:36 +0100
+Subject: [PATCH v4.9.y 40/42] arm/arm64: smccc: Implement SMCCC v1.1 inline primitive
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-41-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit f2d3b2e8759a5833df6f022e42df2d581e6d843c upstream.
+
+One of the major improvement of SMCCC v1.1 is that it only clobbers
+the first 4 registers, both on 32 and 64bit. This means that it
+becomes very easy to provide an inline version of the SMC call
+primitive, and avoid performing a function call to stash the
+registers that would otherwise be clobbered by SMCCC v1.0.
+
+Reviewed-by: Robin Murphy <robin.murphy@arm.com>
+Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/arm-smccc.h |  141 ++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 141 insertions(+)
+
+--- a/include/linux/arm-smccc.h
++++ b/include/linux/arm-smccc.h
+@@ -150,5 +150,146 @@ asmlinkage void __arm_smccc_hvc(unsigned
+ #define arm_smccc_hvc_quirk(...) __arm_smccc_hvc(__VA_ARGS__)
++/* SMCCC v1.1 implementation madness follows */
++#ifdef CONFIG_ARM64
++
++#define SMCCC_SMC_INST        "smc    #0"
++#define SMCCC_HVC_INST        "hvc    #0"
++
++#elif defined(CONFIG_ARM)
++#include <asm/opcodes-sec.h>
++#include <asm/opcodes-virt.h>
++
++#define SMCCC_SMC_INST        __SMC(0)
++#define SMCCC_HVC_INST        __HVC(0)
++
++#endif
++
++#define ___count_args(_0, _1, _2, _3, _4, _5, _6, _7, _8, x, ...) x
++
++#define __count_args(...)                                             \
++      ___count_args(__VA_ARGS__, 7, 6, 5, 4, 3, 2, 1, 0)
++
++#define __constraint_write_0                                          \
++      "+r" (r0), "=&r" (r1), "=&r" (r2), "=&r" (r3)
++#define __constraint_write_1                                          \
++      "+r" (r0), "+r" (r1), "=&r" (r2), "=&r" (r3)
++#define __constraint_write_2                                          \
++      "+r" (r0), "+r" (r1), "+r" (r2), "=&r" (r3)
++#define __constraint_write_3                                          \
++      "+r" (r0), "+r" (r1), "+r" (r2), "+r" (r3)
++#define __constraint_write_4  __constraint_write_3
++#define __constraint_write_5  __constraint_write_4
++#define __constraint_write_6  __constraint_write_5
++#define __constraint_write_7  __constraint_write_6
++
++#define __constraint_read_0
++#define __constraint_read_1
++#define __constraint_read_2
++#define __constraint_read_3
++#define __constraint_read_4   "r" (r4)
++#define __constraint_read_5   __constraint_read_4, "r" (r5)
++#define __constraint_read_6   __constraint_read_5, "r" (r6)
++#define __constraint_read_7   __constraint_read_6, "r" (r7)
++
++#define __declare_arg_0(a0, res)                                      \
++      struct arm_smccc_res   *___res = res;                           \
++      register u32           r0 asm("r0") = a0;                       \
++      register unsigned long r1 asm("r1");                            \
++      register unsigned long r2 asm("r2");                            \
++      register unsigned long r3 asm("r3")
++
++#define __declare_arg_1(a0, a1, res)                                  \
++      struct arm_smccc_res   *___res = res;                           \
++      register u32           r0 asm("r0") = a0;                       \
++      register typeof(a1)    r1 asm("r1") = a1;                       \
++      register unsigned long r2 asm("r2");                            \
++      register unsigned long r3 asm("r3")
++
++#define __declare_arg_2(a0, a1, a2, res)                              \
++      struct arm_smccc_res   *___res = res;                           \
++      register u32           r0 asm("r0") = a0;                       \
++      register typeof(a1)    r1 asm("r1") = a1;                       \
++      register typeof(a2)    r2 asm("r2") = a2;                       \
++      register unsigned long r3 asm("r3")
++
++#define __declare_arg_3(a0, a1, a2, a3, res)                          \
++      struct arm_smccc_res   *___res = res;                           \
++      register u32           r0 asm("r0") = a0;                       \
++      register typeof(a1)    r1 asm("r1") = a1;                       \
++      register typeof(a2)    r2 asm("r2") = a2;                       \
++      register typeof(a3)    r3 asm("r3") = a3
++
++#define __declare_arg_4(a0, a1, a2, a3, a4, res)                      \
++      __declare_arg_3(a0, a1, a2, a3, res);                           \
++      register typeof(a4) r4 asm("r4") = a4
++
++#define __declare_arg_5(a0, a1, a2, a3, a4, a5, res)                  \
++      __declare_arg_4(a0, a1, a2, a3, a4, res);                       \
++      register typeof(a5) r5 asm("r5") = a5
++
++#define __declare_arg_6(a0, a1, a2, a3, a4, a5, a6, res)              \
++      __declare_arg_5(a0, a1, a2, a3, a4, a5, res);                   \
++      register typeof(a6) r6 asm("r6") = a6
++
++#define __declare_arg_7(a0, a1, a2, a3, a4, a5, a6, a7, res)          \
++      __declare_arg_6(a0, a1, a2, a3, a4, a5, a6, res);               \
++      register typeof(a7) r7 asm("r7") = a7
++
++#define ___declare_args(count, ...) __declare_arg_ ## count(__VA_ARGS__)
++#define __declare_args(count, ...)  ___declare_args(count, __VA_ARGS__)
++
++#define ___constraints(count)                                         \
++      : __constraint_write_ ## count                                  \
++      : __constraint_read_ ## count                                   \
++      : "memory"
++#define __constraints(count)  ___constraints(count)
++
++/*
++ * We have an output list that is not necessarily used, and GCC feels
++ * entitled to optimise the whole sequence away. "volatile" is what
++ * makes it stick.
++ */
++#define __arm_smccc_1_1(inst, ...)                                    \
++      do {                                                            \
++              __declare_args(__count_args(__VA_ARGS__), __VA_ARGS__); \
++              asm volatile(inst "\n"                                  \
++                           __constraints(__count_args(__VA_ARGS__))); \
++              if (___res)                                             \
++                      *___res = (typeof(*___res)){r0, r1, r2, r3};    \
++      } while (0)
++
++/*
++ * arm_smccc_1_1_smc() - make an SMCCC v1.1 compliant SMC call
++ *
++ * This is a variadic macro taking one to eight source arguments, and
++ * an optional return structure.
++ *
++ * @a0-a7: arguments passed in registers 0 to 7
++ * @res: result values from registers 0 to 3
++ *
++ * This macro is used to make SMC calls following SMC Calling Convention v1.1.
++ * The content of the supplied param are copied to registers 0 to 7 prior
++ * to the SMC instruction. The return values are updated with the content
++ * from register 0 to 3 on return from the SMC instruction if not NULL.
++ */
++#define arm_smccc_1_1_smc(...)        __arm_smccc_1_1(SMCCC_SMC_INST, __VA_ARGS__)
++
++/*
++ * arm_smccc_1_1_hvc() - make an SMCCC v1.1 compliant HVC call
++ *
++ * This is a variadic macro taking one to eight source arguments, and
++ * an optional return structure.
++ *
++ * @a0-a7: arguments passed in registers 0 to 7
++ * @res: result values from registers 0 to 3
++ *
++ * This macro is used to make HVC calls following SMC Calling Convention v1.1.
++ * The content of the supplied param are copied to registers 0 to 7 prior
++ * to the HVC instruction. The return values are updated with the content
++ * from register 0 to 3 on return from the HVC instruction if not NULL.
++ */
++#define arm_smccc_1_1_hvc(...)        __arm_smccc_1_1(SMCCC_HVC_INST, __VA_ARGS__)
++
+ #endif /*__ASSEMBLY__*/
+ #endif /*__LINUX_ARM_SMCCC_H*/
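A usage sketch for the new primitive, mirroring how the ARM_SMCCC_ARCH_WORKAROUND_1 detection later in this queue calls it; ARM_SMCCC_ARCH_WORKAROUND_1 comes from that later patch and install_bp_hardening() is a placeholder, both assumptions of this sketch:

#include <linux/arm-smccc.h>

/* Because SMCCC v1.1 clobbers only r0-r3/x0-x3, this expands to a single
 * inline SMC with register constraints, with no out-of-line helper needed
 * to stash registers. */
static void check_arch_workaround_1(void)
{
	struct arm_smccc_res res;

	arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
			  ARM_SMCCC_ARCH_WORKAROUND_1, &res);
	if ((int)res.a0 >= 0)
		install_bp_hardening();	/* placeholder for the caller's action */
}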
diff --git a/queue-4.9/arm-arm64-smccc-make-function-identifiers-an-unsigned-quantity.patch b/queue-4.9/arm-arm64-smccc-make-function-identifiers-an-unsigned-quantity.patch
new file mode 100644 (file)
index 0000000..c928b03
--- /dev/null
@@ -0,0 +1,58 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:35 +0100
+Subject: [PATCH v4.9.y 39/42] arm/arm64: smccc: Make function identifiers an unsigned quantity
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-40-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit ded4c39e93f3b72968fdb79baba27f3b83dad34c upstream.
+
+Function identifiers are a 32bit, unsigned quantity. But we never
+tell so to the compiler, resulting in the following:
+
+ 4ac:   b26187e0        mov     x0, #0xffffffff80000001
+
+We thus rely on the firmware narrowing it for us, which is not
+always a reasonable expectation.
+
+Cc: stable@vger.kernel.org
+Reported-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Reviewed-by: Robin Murphy <robin.murphy@arm.com>
+Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/arm-smccc.h |    6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/include/linux/arm-smccc.h
++++ b/include/linux/arm-smccc.h
+@@ -14,14 +14,16 @@
+ #ifndef __LINUX_ARM_SMCCC_H
+ #define __LINUX_ARM_SMCCC_H
++#include <uapi/linux/const.h>
++
+ /*
+  * This file provides common defines for ARM SMC Calling Convention as
+  * specified in
+  * http://infocenter.arm.com/help/topic/com.arm.doc.den0028a/index.html
+  */
+-#define ARM_SMCCC_STD_CALL            0
+-#define ARM_SMCCC_FAST_CALL           1
++#define ARM_SMCCC_STD_CALL            _AC(0,U)
++#define ARM_SMCCC_FAST_CALL           _AC(1,U)
+ #define ARM_SMCCC_TYPE_SHIFT          31
+ #define ARM_SMCCC_SMC_32              0
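The sign-extension the commit message refers to can be reproduced in a few lines of plain C; this is a standalone illustration, not kernel code:

#include <stdio.h>

int main(void)
{
	/* A fast-call function ID has bit 31 set. Built from signed constants
	 * (roughly what the old macros produced), the 32-bit value is negative
	 * and is sign-extended when widened into a 64-bit register; built from
	 * unsigned constants (the new _AC(x,U) macros), it stays 0x80000001. */
	int          id_signed   = (int)0x80000001u;
	unsigned int id_unsigned = 0x80000001u;

	printf("widened from signed:   %#018llx\n",
	       (unsigned long long)(long long)id_signed);
	printf("widened from unsigned: %#018llx\n",
	       (unsigned long long)id_unsigned);
	return 0;
}

On an LP64 target this prints 0xffffffff80000001 and 0x0000000080000001, matching the mov shown in the commit message.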
diff --git a/queue-4.9/arm64-add-arm_smccc_arch_workaround_1-bp-hardening-support.patch b/queue-4.9/arm64-add-arm_smccc_arch_workaround_1-bp-hardening-support.patch
new file mode 100644 (file)
index 0000000..dc973da
--- /dev/null
@@ -0,0 +1,3323 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:37 +0100
+Subject: [PATCH v4.9.y 41/42] arm64: Add ARM_SMCCC_ARCH_WORKAROUND_1 BP hardening support
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-42-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit b092201e0020614127f495c092e0a12d26a2116e upstream.
+
+Add the detection and runtime code for ARM_SMCCC_ARCH_WORKAROUND_1.
+It is lovely. Really.
+
+Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/crypto/sha256-core.S | 2061 ++++++++++++++++++++++++++++++++++++++++
+ arch/arm64/crypto/sha512-core.S | 1085 +++++++++++++++++++++
+ arch/arm64/kernel/bpi.S         |   20 
+ arch/arm64/kernel/cpu_errata.c  |   72 +
+ 4 files changed, 3235 insertions(+), 3 deletions(-)
+ create mode 100644 arch/arm64/crypto/sha256-core.S
+ create mode 100644 arch/arm64/crypto/sha512-core.S
+
+--- /dev/null
++++ b/arch/arm64/crypto/sha256-core.S
+@@ -0,0 +1,2061 @@
++// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
++//
++// Licensed under the OpenSSL license (the "License").  You may not use
++// this file except in compliance with the License.  You can obtain a copy
++// in the file LICENSE in the source distribution or at
++// https://www.openssl.org/source/license.html
++
++// ====================================================================
++// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
++// project. The module is, however, dual licensed under OpenSSL and
++// CRYPTOGAMS licenses depending on where you obtain it. For further
++// details see http://www.openssl.org/~appro/cryptogams/.
++//
++// Permission to use under GPLv2 terms is granted.
++// ====================================================================
++//
++// SHA256/512 for ARMv8.
++//
++// Performance in cycles per processed byte and improvement coefficient
++// over code generated with "default" compiler:
++//
++//            SHA256-hw       SHA256(*)       SHA512
++// Apple A7   1.97            10.5 (+33%)     6.73 (-1%(**))
++// Cortex-A53 2.38            15.5 (+115%)    10.0 (+150%(***))
++// Cortex-A57 2.31            11.6 (+86%)     7.51 (+260%(***))
++// Denver     2.01            10.5 (+26%)     6.70 (+8%)
++// X-Gene                     20.0 (+100%)    12.8 (+300%(***))
++// Mongoose   2.36            13.0 (+50%)     8.36 (+33%)
++//
++// (*)        Software SHA256 results are of lesser relevance, presented
++//    mostly for informational purposes.
++// (**)       The result is a trade-off: it's possible to improve it by
++//    10% (or by 1 cycle per round), but at the cost of 20% loss
++//    on Cortex-A53 (or by 4 cycles per round).
++// (***)      Super-impressive coefficients over gcc-generated code are
++//    indication of some compiler "pathology", most notably code
++//    generated with -mgeneral-regs-only is significanty faster
++//    and the gap is only 40-90%.
++//
++// October 2016.
++//
++// Originally it was reckoned that it makes no sense to implement NEON
++// version of SHA256 for 64-bit processors. This is because performance
++// improvement on most wide-spread Cortex-A5x processors was observed
++// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
++// observed that 32-bit NEON SHA256 performs significantly better than
++// 64-bit scalar version on *some* of the more recent processors. As
++// result 64-bit NEON version of SHA256 was added to provide best
++// all-round performance. For example it executes ~30% faster on X-Gene
++// and Mongoose. [For reference, NEON version of SHA512 is bound to
++// deliver much less improvement, likely *negative* on Cortex-A5x.
++// Which is why NEON support is limited to SHA256.]
++
++#ifndef       __KERNEL__
++# include "arm_arch.h"
++#endif
++
++.text
++
++.extern       OPENSSL_armcap_P
++.globl        sha256_block_data_order
++.type sha256_block_data_order,%function
++.align        6
++sha256_block_data_order:
++#ifndef       __KERNEL__
++# ifdef       __ILP32__
++      ldrsw   x16,.LOPENSSL_armcap_P
++# else
++      ldr     x16,.LOPENSSL_armcap_P
++# endif
++      adr     x17,.LOPENSSL_armcap_P
++      add     x16,x16,x17
++      ldr     w16,[x16]
++      tst     w16,#ARMV8_SHA256
++      b.ne    .Lv8_entry
++      tst     w16,#ARMV7_NEON
++      b.ne    .Lneon_entry
++#endif
++      stp     x29,x30,[sp,#-128]!
++      add     x29,sp,#0
++
++      stp     x19,x20,[sp,#16]
++      stp     x21,x22,[sp,#32]
++      stp     x23,x24,[sp,#48]
++      stp     x25,x26,[sp,#64]
++      stp     x27,x28,[sp,#80]
++      sub     sp,sp,#4*4
++
++      ldp     w20,w21,[x0]                            // load context
++      ldp     w22,w23,[x0,#2*4]
++      ldp     w24,w25,[x0,#4*4]
++      add     x2,x1,x2,lsl#6  // end of input
++      ldp     w26,w27,[x0,#6*4]
++      adr     x30,.LK256
++      stp     x0,x2,[x29,#96]
++
++.Loop:
++      ldp     w3,w4,[x1],#2*4
++      ldr     w19,[x30],#4                    // *K++
++      eor     w28,w21,w22                             // magic seed
++      str     x1,[x29,#112]
++#ifndef       __AARCH64EB__
++      rev     w3,w3                   // 0
++#endif
++      ror     w16,w24,#6
++      add     w27,w27,w19                     // h+=K[i]
++      eor     w6,w24,w24,ror#14
++      and     w17,w25,w24
++      bic     w19,w26,w24
++      add     w27,w27,w3                      // h+=X[i]
++      orr     w17,w17,w19                     // Ch(e,f,g)
++      eor     w19,w20,w21                     // a^b, b^c in next round
++      eor     w16,w16,w6,ror#11       // Sigma1(e)
++      ror     w6,w20,#2
++      add     w27,w27,w17                     // h+=Ch(e,f,g)
++      eor     w17,w20,w20,ror#9
++      add     w27,w27,w16                     // h+=Sigma1(e)
++      and     w28,w28,w19                     // (b^c)&=(a^b)
++      add     w23,w23,w27                     // d+=h
++      eor     w28,w28,w21                     // Maj(a,b,c)
++      eor     w17,w6,w17,ror#13       // Sigma0(a)
++      add     w27,w27,w28                     // h+=Maj(a,b,c)
++      ldr     w28,[x30],#4            // *K++, w19 in next round
++      //add   w27,w27,w17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     w4,w4                   // 1
++#endif
++      ldp     w5,w6,[x1],#2*4
++      add     w27,w27,w17                     // h+=Sigma0(a)
++      ror     w16,w23,#6
++      add     w26,w26,w28                     // h+=K[i]
++      eor     w7,w23,w23,ror#14
++      and     w17,w24,w23
++      bic     w28,w25,w23
++      add     w26,w26,w4                      // h+=X[i]
++      orr     w17,w17,w28                     // Ch(e,f,g)
++      eor     w28,w27,w20                     // a^b, b^c in next round
++      eor     w16,w16,w7,ror#11       // Sigma1(e)
++      ror     w7,w27,#2
++      add     w26,w26,w17                     // h+=Ch(e,f,g)
++      eor     w17,w27,w27,ror#9
++      add     w26,w26,w16                     // h+=Sigma1(e)
++      and     w19,w19,w28                     // (b^c)&=(a^b)
++      add     w22,w22,w26                     // d+=h
++      eor     w19,w19,w20                     // Maj(a,b,c)
++      eor     w17,w7,w17,ror#13       // Sigma0(a)
++      add     w26,w26,w19                     // h+=Maj(a,b,c)
++      ldr     w19,[x30],#4            // *K++, w28 in next round
++      //add   w26,w26,w17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     w5,w5                   // 2
++#endif
++      add     w26,w26,w17                     // h+=Sigma0(a)
++      ror     w16,w22,#6
++      add     w25,w25,w19                     // h+=K[i]
++      eor     w8,w22,w22,ror#14
++      and     w17,w23,w22
++      bic     w19,w24,w22
++      add     w25,w25,w5                      // h+=X[i]
++      orr     w17,w17,w19                     // Ch(e,f,g)
++      eor     w19,w26,w27                     // a^b, b^c in next round
++      eor     w16,w16,w8,ror#11       // Sigma1(e)
++      ror     w8,w26,#2
++      add     w25,w25,w17                     // h+=Ch(e,f,g)
++      eor     w17,w26,w26,ror#9
++      add     w25,w25,w16                     // h+=Sigma1(e)
++      and     w28,w28,w19                     // (b^c)&=(a^b)
++      add     w21,w21,w25                     // d+=h
++      eor     w28,w28,w27                     // Maj(a,b,c)
++      eor     w17,w8,w17,ror#13       // Sigma0(a)
++      add     w25,w25,w28                     // h+=Maj(a,b,c)
++      ldr     w28,[x30],#4            // *K++, w19 in next round
++      //add   w25,w25,w17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     w6,w6                   // 3
++#endif
++      ldp     w7,w8,[x1],#2*4
++      add     w25,w25,w17                     // h+=Sigma0(a)
++      ror     w16,w21,#6
++      add     w24,w24,w28                     // h+=K[i]
++      eor     w9,w21,w21,ror#14
++      and     w17,w22,w21
++      bic     w28,w23,w21
++      add     w24,w24,w6                      // h+=X[i]
++      orr     w17,w17,w28                     // Ch(e,f,g)
++      eor     w28,w25,w26                     // a^b, b^c in next round
++      eor     w16,w16,w9,ror#11       // Sigma1(e)
++      ror     w9,w25,#2
++      add     w24,w24,w17                     // h+=Ch(e,f,g)
++      eor     w17,w25,w25,ror#9
++      add     w24,w24,w16                     // h+=Sigma1(e)
++      and     w19,w19,w28                     // (b^c)&=(a^b)
++      add     w20,w20,w24                     // d+=h
++      eor     w19,w19,w26                     // Maj(a,b,c)
++      eor     w17,w9,w17,ror#13       // Sigma0(a)
++      add     w24,w24,w19                     // h+=Maj(a,b,c)
++      ldr     w19,[x30],#4            // *K++, w28 in next round
++      //add   w24,w24,w17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     w7,w7                   // 4
++#endif
++      add     w24,w24,w17                     // h+=Sigma0(a)
++      ror     w16,w20,#6
++      add     w23,w23,w19                     // h+=K[i]
++      eor     w10,w20,w20,ror#14
++      and     w17,w21,w20
++      bic     w19,w22,w20
++      add     w23,w23,w7                      // h+=X[i]
++      orr     w17,w17,w19                     // Ch(e,f,g)
++      eor     w19,w24,w25                     // a^b, b^c in next round
++      eor     w16,w16,w10,ror#11      // Sigma1(e)
++      ror     w10,w24,#2
++      add     w23,w23,w17                     // h+=Ch(e,f,g)
++      eor     w17,w24,w24,ror#9
++      add     w23,w23,w16                     // h+=Sigma1(e)
++      and     w28,w28,w19                     // (b^c)&=(a^b)
++      add     w27,w27,w23                     // d+=h
++      eor     w28,w28,w25                     // Maj(a,b,c)
++      eor     w17,w10,w17,ror#13      // Sigma0(a)
++      add     w23,w23,w28                     // h+=Maj(a,b,c)
++      ldr     w28,[x30],#4            // *K++, w19 in next round
++      //add   w23,w23,w17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     w8,w8                   // 5
++#endif
++      ldp     w9,w10,[x1],#2*4
++      add     w23,w23,w17                     // h+=Sigma0(a)
++      ror     w16,w27,#6
++      add     w22,w22,w28                     // h+=K[i]
++      eor     w11,w27,w27,ror#14
++      and     w17,w20,w27
++      bic     w28,w21,w27
++      add     w22,w22,w8                      // h+=X[i]
++      orr     w17,w17,w28                     // Ch(e,f,g)
++      eor     w28,w23,w24                     // a^b, b^c in next round
++      eor     w16,w16,w11,ror#11      // Sigma1(e)
++      ror     w11,w23,#2
++      add     w22,w22,w17                     // h+=Ch(e,f,g)
++      eor     w17,w23,w23,ror#9
++      add     w22,w22,w16                     // h+=Sigma1(e)
++      and     w19,w19,w28                     // (b^c)&=(a^b)
++      add     w26,w26,w22                     // d+=h
++      eor     w19,w19,w24                     // Maj(a,b,c)
++      eor     w17,w11,w17,ror#13      // Sigma0(a)
++      add     w22,w22,w19                     // h+=Maj(a,b,c)
++      ldr     w19,[x30],#4            // *K++, w28 in next round
++      //add   w22,w22,w17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     w9,w9                   // 6
++#endif
++      add     w22,w22,w17                     // h+=Sigma0(a)
++      ror     w16,w26,#6
++      add     w21,w21,w19                     // h+=K[i]
++      eor     w12,w26,w26,ror#14
++      and     w17,w27,w26
++      bic     w19,w20,w26
++      add     w21,w21,w9                      // h+=X[i]
++      orr     w17,w17,w19                     // Ch(e,f,g)
++      eor     w19,w22,w23                     // a^b, b^c in next round
++      eor     w16,w16,w12,ror#11      // Sigma1(e)
++      ror     w12,w22,#2
++      add     w21,w21,w17                     // h+=Ch(e,f,g)
++      eor     w17,w22,w22,ror#9
++      add     w21,w21,w16                     // h+=Sigma1(e)
++      and     w28,w28,w19                     // (b^c)&=(a^b)
++      add     w25,w25,w21                     // d+=h
++      eor     w28,w28,w23                     // Maj(a,b,c)
++      eor     w17,w12,w17,ror#13      // Sigma0(a)
++      add     w21,w21,w28                     // h+=Maj(a,b,c)
++      ldr     w28,[x30],#4            // *K++, w19 in next round
++      //add   w21,w21,w17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     w10,w10                 // 7
++#endif
++      ldp     w11,w12,[x1],#2*4
++      add     w21,w21,w17                     // h+=Sigma0(a)
++      ror     w16,w25,#6
++      add     w20,w20,w28                     // h+=K[i]
++      eor     w13,w25,w25,ror#14
++      and     w17,w26,w25
++      bic     w28,w27,w25
++      add     w20,w20,w10                     // h+=X[i]
++      orr     w17,w17,w28                     // Ch(e,f,g)
++      eor     w28,w21,w22                     // a^b, b^c in next round
++      eor     w16,w16,w13,ror#11      // Sigma1(e)
++      ror     w13,w21,#2
++      add     w20,w20,w17                     // h+=Ch(e,f,g)
++      eor     w17,w21,w21,ror#9
++      add     w20,w20,w16                     // h+=Sigma1(e)
++      and     w19,w19,w28                     // (b^c)&=(a^b)
++      add     w24,w24,w20                     // d+=h
++      eor     w19,w19,w22                     // Maj(a,b,c)
++      eor     w17,w13,w17,ror#13      // Sigma0(a)
++      add     w20,w20,w19                     // h+=Maj(a,b,c)
++      ldr     w19,[x30],#4            // *K++, w28 in next round
++      //add   w20,w20,w17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     w11,w11                 // 8
++#endif
++      add     w20,w20,w17                     // h+=Sigma0(a)
++      ror     w16,w24,#6
++      add     w27,w27,w19                     // h+=K[i]
++      eor     w14,w24,w24,ror#14
++      and     w17,w25,w24
++      bic     w19,w26,w24
++      add     w27,w27,w11                     // h+=X[i]
++      orr     w17,w17,w19                     // Ch(e,f,g)
++      eor     w19,w20,w21                     // a^b, b^c in next round
++      eor     w16,w16,w14,ror#11      // Sigma1(e)
++      ror     w14,w20,#2
++      add     w27,w27,w17                     // h+=Ch(e,f,g)
++      eor     w17,w20,w20,ror#9
++      add     w27,w27,w16                     // h+=Sigma1(e)
++      and     w28,w28,w19                     // (b^c)&=(a^b)
++      add     w23,w23,w27                     // d+=h
++      eor     w28,w28,w21                     // Maj(a,b,c)
++      eor     w17,w14,w17,ror#13      // Sigma0(a)
++      add     w27,w27,w28                     // h+=Maj(a,b,c)
++      ldr     w28,[x30],#4            // *K++, w19 in next round
++      //add   w27,w27,w17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     w12,w12                 // 9
++#endif
++      ldp     w13,w14,[x1],#2*4
++      add     w27,w27,w17                     // h+=Sigma0(a)
++      ror     w16,w23,#6
++      add     w26,w26,w28                     // h+=K[i]
++      eor     w15,w23,w23,ror#14
++      and     w17,w24,w23
++      bic     w28,w25,w23
++      add     w26,w26,w12                     // h+=X[i]
++      orr     w17,w17,w28                     // Ch(e,f,g)
++      eor     w28,w27,w20                     // a^b, b^c in next round
++      eor     w16,w16,w15,ror#11      // Sigma1(e)
++      ror     w15,w27,#2
++      add     w26,w26,w17                     // h+=Ch(e,f,g)
++      eor     w17,w27,w27,ror#9
++      add     w26,w26,w16                     // h+=Sigma1(e)
++      and     w19,w19,w28                     // (b^c)&=(a^b)
++      add     w22,w22,w26                     // d+=h
++      eor     w19,w19,w20                     // Maj(a,b,c)
++      eor     w17,w15,w17,ror#13      // Sigma0(a)
++      add     w26,w26,w19                     // h+=Maj(a,b,c)
++      ldr     w19,[x30],#4            // *K++, w28 in next round
++      //add   w26,w26,w17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     w13,w13                 // 10
++#endif
++      add     w26,w26,w17                     // h+=Sigma0(a)
++      ror     w16,w22,#6
++      add     w25,w25,w19                     // h+=K[i]
++      eor     w0,w22,w22,ror#14
++      and     w17,w23,w22
++      bic     w19,w24,w22
++      add     w25,w25,w13                     // h+=X[i]
++      orr     w17,w17,w19                     // Ch(e,f,g)
++      eor     w19,w26,w27                     // a^b, b^c in next round
++      eor     w16,w16,w0,ror#11       // Sigma1(e)
++      ror     w0,w26,#2
++      add     w25,w25,w17                     // h+=Ch(e,f,g)
++      eor     w17,w26,w26,ror#9
++      add     w25,w25,w16                     // h+=Sigma1(e)
++      and     w28,w28,w19                     // (b^c)&=(a^b)
++      add     w21,w21,w25                     // d+=h
++      eor     w28,w28,w27                     // Maj(a,b,c)
++      eor     w17,w0,w17,ror#13       // Sigma0(a)
++      add     w25,w25,w28                     // h+=Maj(a,b,c)
++      ldr     w28,[x30],#4            // *K++, w19 in next round
++      //add   w25,w25,w17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     w14,w14                 // 11
++#endif
++      ldp     w15,w0,[x1],#2*4
++      add     w25,w25,w17                     // h+=Sigma0(a)
++      str     w6,[sp,#12]
++      ror     w16,w21,#6
++      add     w24,w24,w28                     // h+=K[i]
++      eor     w6,w21,w21,ror#14
++      and     w17,w22,w21
++      bic     w28,w23,w21
++      add     w24,w24,w14                     // h+=X[i]
++      orr     w17,w17,w28                     // Ch(e,f,g)
++      eor     w28,w25,w26                     // a^b, b^c in next round
++      eor     w16,w16,w6,ror#11       // Sigma1(e)
++      ror     w6,w25,#2
++      add     w24,w24,w17                     // h+=Ch(e,f,g)
++      eor     w17,w25,w25,ror#9
++      add     w24,w24,w16                     // h+=Sigma1(e)
++      and     w19,w19,w28                     // (b^c)&=(a^b)
++      add     w20,w20,w24                     // d+=h
++      eor     w19,w19,w26                     // Maj(a,b,c)
++      eor     w17,w6,w17,ror#13       // Sigma0(a)
++      add     w24,w24,w19                     // h+=Maj(a,b,c)
++      ldr     w19,[x30],#4            // *K++, w28 in next round
++      //add   w24,w24,w17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     w15,w15                 // 12
++#endif
++      add     w24,w24,w17                     // h+=Sigma0(a)
++      str     w7,[sp,#0]
++      ror     w16,w20,#6
++      add     w23,w23,w19                     // h+=K[i]
++      eor     w7,w20,w20,ror#14
++      and     w17,w21,w20
++      bic     w19,w22,w20
++      add     w23,w23,w15                     // h+=X[i]
++      orr     w17,w17,w19                     // Ch(e,f,g)
++      eor     w19,w24,w25                     // a^b, b^c in next round
++      eor     w16,w16,w7,ror#11       // Sigma1(e)
++      ror     w7,w24,#2
++      add     w23,w23,w17                     // h+=Ch(e,f,g)
++      eor     w17,w24,w24,ror#9
++      add     w23,w23,w16                     // h+=Sigma1(e)
++      and     w28,w28,w19                     // (b^c)&=(a^b)
++      add     w27,w27,w23                     // d+=h
++      eor     w28,w28,w25                     // Maj(a,b,c)
++      eor     w17,w7,w17,ror#13       // Sigma0(a)
++      add     w23,w23,w28                     // h+=Maj(a,b,c)
++      ldr     w28,[x30],#4            // *K++, w19 in next round
++      //add   w23,w23,w17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     w0,w0                   // 13
++#endif
++      ldp     w1,w2,[x1]
++      add     w23,w23,w17                     // h+=Sigma0(a)
++      str     w8,[sp,#4]
++      ror     w16,w27,#6
++      add     w22,w22,w28                     // h+=K[i]
++      eor     w8,w27,w27,ror#14
++      and     w17,w20,w27
++      bic     w28,w21,w27
++      add     w22,w22,w0                      // h+=X[i]
++      orr     w17,w17,w28                     // Ch(e,f,g)
++      eor     w28,w23,w24                     // a^b, b^c in next round
++      eor     w16,w16,w8,ror#11       // Sigma1(e)
++      ror     w8,w23,#2
++      add     w22,w22,w17                     // h+=Ch(e,f,g)
++      eor     w17,w23,w23,ror#9
++      add     w22,w22,w16                     // h+=Sigma1(e)
++      and     w19,w19,w28                     // (b^c)&=(a^b)
++      add     w26,w26,w22                     // d+=h
++      eor     w19,w19,w24                     // Maj(a,b,c)
++      eor     w17,w8,w17,ror#13       // Sigma0(a)
++      add     w22,w22,w19                     // h+=Maj(a,b,c)
++      ldr     w19,[x30],#4            // *K++, w28 in next round
++      //add   w22,w22,w17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     w1,w1                   // 14
++#endif
++      ldr     w6,[sp,#12]
++      add     w22,w22,w17                     // h+=Sigma0(a)
++      str     w9,[sp,#8]
++      ror     w16,w26,#6
++      add     w21,w21,w19                     // h+=K[i]
++      eor     w9,w26,w26,ror#14
++      and     w17,w27,w26
++      bic     w19,w20,w26
++      add     w21,w21,w1                      // h+=X[i]
++      orr     w17,w17,w19                     // Ch(e,f,g)
++      eor     w19,w22,w23                     // a^b, b^c in next round
++      eor     w16,w16,w9,ror#11       // Sigma1(e)
++      ror     w9,w22,#2
++      add     w21,w21,w17                     // h+=Ch(e,f,g)
++      eor     w17,w22,w22,ror#9
++      add     w21,w21,w16                     // h+=Sigma1(e)
++      and     w28,w28,w19                     // (b^c)&=(a^b)
++      add     w25,w25,w21                     // d+=h
++      eor     w28,w28,w23                     // Maj(a,b,c)
++      eor     w17,w9,w17,ror#13       // Sigma0(a)
++      add     w21,w21,w28                     // h+=Maj(a,b,c)
++      ldr     w28,[x30],#4            // *K++, w19 in next round
++      //add   w21,w21,w17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     w2,w2                   // 15
++#endif
++      ldr     w7,[sp,#0]
++      add     w21,w21,w17                     // h+=Sigma0(a)
++      str     w10,[sp,#12]
++      ror     w16,w25,#6
++      add     w20,w20,w28                     // h+=K[i]
++      ror     w9,w4,#7
++      and     w17,w26,w25
++      ror     w8,w1,#17
++      bic     w28,w27,w25
++      ror     w10,w21,#2
++      add     w20,w20,w2                      // h+=X[i]
++      eor     w16,w16,w25,ror#11
++      eor     w9,w9,w4,ror#18
++      orr     w17,w17,w28                     // Ch(e,f,g)
++      eor     w28,w21,w22                     // a^b, b^c in next round
++      eor     w16,w16,w25,ror#25      // Sigma1(e)
++      eor     w10,w10,w21,ror#13
++      add     w20,w20,w17                     // h+=Ch(e,f,g)
++      and     w19,w19,w28                     // (b^c)&=(a^b)
++      eor     w8,w8,w1,ror#19
++      eor     w9,w9,w4,lsr#3  // sigma0(X[i+1])
++      add     w20,w20,w16                     // h+=Sigma1(e)
++      eor     w19,w19,w22                     // Maj(a,b,c)
++      eor     w17,w10,w21,ror#22      // Sigma0(a)
++      eor     w8,w8,w1,lsr#10 // sigma1(X[i+14])
++      add     w3,w3,w12
++      add     w24,w24,w20                     // d+=h
++      add     w20,w20,w19                     // h+=Maj(a,b,c)
++      ldr     w19,[x30],#4            // *K++, w28 in next round
++      add     w3,w3,w9
++      add     w20,w20,w17                     // h+=Sigma0(a)
++      add     w3,w3,w8
++.Loop_16_xx:
++      ldr     w8,[sp,#4]
++      str     w11,[sp,#0]
++      ror     w16,w24,#6
++      add     w27,w27,w19                     // h+=K[i]
++      ror     w10,w5,#7
++      and     w17,w25,w24
++      ror     w9,w2,#17
++      bic     w19,w26,w24
++      ror     w11,w20,#2
++      add     w27,w27,w3                      // h+=X[i]
++      eor     w16,w16,w24,ror#11
++      eor     w10,w10,w5,ror#18
++      orr     w17,w17,w19                     // Ch(e,f,g)
++      eor     w19,w20,w21                     // a^b, b^c in next round
++      eor     w16,w16,w24,ror#25      // Sigma1(e)
++      eor     w11,w11,w20,ror#13
++      add     w27,w27,w17                     // h+=Ch(e,f,g)
++      and     w28,w28,w19                     // (b^c)&=(a^b)
++      eor     w9,w9,w2,ror#19
++      eor     w10,w10,w5,lsr#3        // sigma0(X[i+1])
++      add     w27,w27,w16                     // h+=Sigma1(e)
++      eor     w28,w28,w21                     // Maj(a,b,c)
++      eor     w17,w11,w20,ror#22      // Sigma0(a)
++      eor     w9,w9,w2,lsr#10 // sigma1(X[i+14])
++      add     w4,w4,w13
++      add     w23,w23,w27                     // d+=h
++      add     w27,w27,w28                     // h+=Maj(a,b,c)
++      ldr     w28,[x30],#4            // *K++, w19 in next round
++      add     w4,w4,w10
++      add     w27,w27,w17                     // h+=Sigma0(a)
++      add     w4,w4,w9
++      ldr     w9,[sp,#8]
++      str     w12,[sp,#4]
++      ror     w16,w23,#6
++      add     w26,w26,w28                     // h+=K[i]
++      ror     w11,w6,#7
++      and     w17,w24,w23
++      ror     w10,w3,#17
++      bic     w28,w25,w23
++      ror     w12,w27,#2
++      add     w26,w26,w4                      // h+=X[i]
++      eor     w16,w16,w23,ror#11
++      eor     w11,w11,w6,ror#18
++      orr     w17,w17,w28                     // Ch(e,f,g)
++      eor     w28,w27,w20                     // a^b, b^c in next round
++      eor     w16,w16,w23,ror#25      // Sigma1(e)
++      eor     w12,w12,w27,ror#13
++      add     w26,w26,w17                     // h+=Ch(e,f,g)
++      and     w19,w19,w28                     // (b^c)&=(a^b)
++      eor     w10,w10,w3,ror#19
++      eor     w11,w11,w6,lsr#3        // sigma0(X[i+1])
++      add     w26,w26,w16                     // h+=Sigma1(e)
++      eor     w19,w19,w20                     // Maj(a,b,c)
++      eor     w17,w12,w27,ror#22      // Sigma0(a)
++      eor     w10,w10,w3,lsr#10       // sigma1(X[i+14])
++      add     w5,w5,w14
++      add     w22,w22,w26                     // d+=h
++      add     w26,w26,w19                     // h+=Maj(a,b,c)
++      ldr     w19,[x30],#4            // *K++, w28 in next round
++      add     w5,w5,w11
++      add     w26,w26,w17                     // h+=Sigma0(a)
++      add     w5,w5,w10
++      ldr     w10,[sp,#12]
++      str     w13,[sp,#8]
++      ror     w16,w22,#6
++      add     w25,w25,w19                     // h+=K[i]
++      ror     w12,w7,#7
++      and     w17,w23,w22
++      ror     w11,w4,#17
++      bic     w19,w24,w22
++      ror     w13,w26,#2
++      add     w25,w25,w5                      // h+=X[i]
++      eor     w16,w16,w22,ror#11
++      eor     w12,w12,w7,ror#18
++      orr     w17,w17,w19                     // Ch(e,f,g)
++      eor     w19,w26,w27                     // a^b, b^c in next round
++      eor     w16,w16,w22,ror#25      // Sigma1(e)
++      eor     w13,w13,w26,ror#13
++      add     w25,w25,w17                     // h+=Ch(e,f,g)
++      and     w28,w28,w19                     // (b^c)&=(a^b)
++      eor     w11,w11,w4,ror#19
++      eor     w12,w12,w7,lsr#3        // sigma0(X[i+1])
++      add     w25,w25,w16                     // h+=Sigma1(e)
++      eor     w28,w28,w27                     // Maj(a,b,c)
++      eor     w17,w13,w26,ror#22      // Sigma0(a)
++      eor     w11,w11,w4,lsr#10       // sigma1(X[i+14])
++      add     w6,w6,w15
++      add     w21,w21,w25                     // d+=h
++      add     w25,w25,w28                     // h+=Maj(a,b,c)
++      ldr     w28,[x30],#4            // *K++, w19 in next round
++      add     w6,w6,w12
++      add     w25,w25,w17                     // h+=Sigma0(a)
++      add     w6,w6,w11
++      ldr     w11,[sp,#0]
++      str     w14,[sp,#12]
++      ror     w16,w21,#6
++      add     w24,w24,w28                     // h+=K[i]
++      ror     w13,w8,#7
++      and     w17,w22,w21
++      ror     w12,w5,#17
++      bic     w28,w23,w21
++      ror     w14,w25,#2
++      add     w24,w24,w6                      // h+=X[i]
++      eor     w16,w16,w21,ror#11
++      eor     w13,w13,w8,ror#18
++      orr     w17,w17,w28                     // Ch(e,f,g)
++      eor     w28,w25,w26                     // a^b, b^c in next round
++      eor     w16,w16,w21,ror#25      // Sigma1(e)
++      eor     w14,w14,w25,ror#13
++      add     w24,w24,w17                     // h+=Ch(e,f,g)
++      and     w19,w19,w28                     // (b^c)&=(a^b)
++      eor     w12,w12,w5,ror#19
++      eor     w13,w13,w8,lsr#3        // sigma0(X[i+1])
++      add     w24,w24,w16                     // h+=Sigma1(e)
++      eor     w19,w19,w26                     // Maj(a,b,c)
++      eor     w17,w14,w25,ror#22      // Sigma0(a)
++      eor     w12,w12,w5,lsr#10       // sigma1(X[i+14])
++      add     w7,w7,w0
++      add     w20,w20,w24                     // d+=h
++      add     w24,w24,w19                     // h+=Maj(a,b,c)
++      ldr     w19,[x30],#4            // *K++, w28 in next round
++      add     w7,w7,w13
++      add     w24,w24,w17                     // h+=Sigma0(a)
++      add     w7,w7,w12
++      ldr     w12,[sp,#4]
++      str     w15,[sp,#0]
++      ror     w16,w20,#6
++      add     w23,w23,w19                     // h+=K[i]
++      ror     w14,w9,#7
++      and     w17,w21,w20
++      ror     w13,w6,#17
++      bic     w19,w22,w20
++      ror     w15,w24,#2
++      add     w23,w23,w7                      // h+=X[i]
++      eor     w16,w16,w20,ror#11
++      eor     w14,w14,w9,ror#18
++      orr     w17,w17,w19                     // Ch(e,f,g)
++      eor     w19,w24,w25                     // a^b, b^c in next round
++      eor     w16,w16,w20,ror#25      // Sigma1(e)
++      eor     w15,w15,w24,ror#13
++      add     w23,w23,w17                     // h+=Ch(e,f,g)
++      and     w28,w28,w19                     // (b^c)&=(a^b)
++      eor     w13,w13,w6,ror#19
++      eor     w14,w14,w9,lsr#3        // sigma0(X[i+1])
++      add     w23,w23,w16                     // h+=Sigma1(e)
++      eor     w28,w28,w25                     // Maj(a,b,c)
++      eor     w17,w15,w24,ror#22      // Sigma0(a)
++      eor     w13,w13,w6,lsr#10       // sigma1(X[i+14])
++      add     w8,w8,w1
++      add     w27,w27,w23                     // d+=h
++      add     w23,w23,w28                     // h+=Maj(a,b,c)
++      ldr     w28,[x30],#4            // *K++, w19 in next round
++      add     w8,w8,w14
++      add     w23,w23,w17                     // h+=Sigma0(a)
++      add     w8,w8,w13
++      ldr     w13,[sp,#8]
++      str     w0,[sp,#4]
++      ror     w16,w27,#6
++      add     w22,w22,w28                     // h+=K[i]
++      ror     w15,w10,#7
++      and     w17,w20,w27
++      ror     w14,w7,#17
++      bic     w28,w21,w27
++      ror     w0,w23,#2
++      add     w22,w22,w8                      // h+=X[i]
++      eor     w16,w16,w27,ror#11
++      eor     w15,w15,w10,ror#18
++      orr     w17,w17,w28                     // Ch(e,f,g)
++      eor     w28,w23,w24                     // a^b, b^c in next round
++      eor     w16,w16,w27,ror#25      // Sigma1(e)
++      eor     w0,w0,w23,ror#13
++      add     w22,w22,w17                     // h+=Ch(e,f,g)
++      and     w19,w19,w28                     // (b^c)&=(a^b)
++      eor     w14,w14,w7,ror#19
++      eor     w15,w15,w10,lsr#3       // sigma0(X[i+1])
++      add     w22,w22,w16                     // h+=Sigma1(e)
++      eor     w19,w19,w24                     // Maj(a,b,c)
++      eor     w17,w0,w23,ror#22       // Sigma0(a)
++      eor     w14,w14,w7,lsr#10       // sigma1(X[i+14])
++      add     w9,w9,w2
++      add     w26,w26,w22                     // d+=h
++      add     w22,w22,w19                     // h+=Maj(a,b,c)
++      ldr     w19,[x30],#4            // *K++, w28 in next round
++      add     w9,w9,w15
++      add     w22,w22,w17                     // h+=Sigma0(a)
++      add     w9,w9,w14
++      ldr     w14,[sp,#12]
++      str     w1,[sp,#8]
++      ror     w16,w26,#6
++      add     w21,w21,w19                     // h+=K[i]
++      ror     w0,w11,#7
++      and     w17,w27,w26
++      ror     w15,w8,#17
++      bic     w19,w20,w26
++      ror     w1,w22,#2
++      add     w21,w21,w9                      // h+=X[i]
++      eor     w16,w16,w26,ror#11
++      eor     w0,w0,w11,ror#18
++      orr     w17,w17,w19                     // Ch(e,f,g)
++      eor     w19,w22,w23                     // a^b, b^c in next round
++      eor     w16,w16,w26,ror#25      // Sigma1(e)
++      eor     w1,w1,w22,ror#13
++      add     w21,w21,w17                     // h+=Ch(e,f,g)
++      and     w28,w28,w19                     // (b^c)&=(a^b)
++      eor     w15,w15,w8,ror#19
++      eor     w0,w0,w11,lsr#3 // sigma0(X[i+1])
++      add     w21,w21,w16                     // h+=Sigma1(e)
++      eor     w28,w28,w23                     // Maj(a,b,c)
++      eor     w17,w1,w22,ror#22       // Sigma0(a)
++      eor     w15,w15,w8,lsr#10       // sigma1(X[i+14])
++      add     w10,w10,w3
++      add     w25,w25,w21                     // d+=h
++      add     w21,w21,w28                     // h+=Maj(a,b,c)
++      ldr     w28,[x30],#4            // *K++, w19 in next round
++      add     w10,w10,w0
++      add     w21,w21,w17                     // h+=Sigma0(a)
++      add     w10,w10,w15
++      ldr     w15,[sp,#0]
++      str     w2,[sp,#12]
++      ror     w16,w25,#6
++      add     w20,w20,w28                     // h+=K[i]
++      ror     w1,w12,#7
++      and     w17,w26,w25
++      ror     w0,w9,#17
++      bic     w28,w27,w25
++      ror     w2,w21,#2
++      add     w20,w20,w10                     // h+=X[i]
++      eor     w16,w16,w25,ror#11
++      eor     w1,w1,w12,ror#18
++      orr     w17,w17,w28                     // Ch(e,f,g)
++      eor     w28,w21,w22                     // a^b, b^c in next round
++      eor     w16,w16,w25,ror#25      // Sigma1(e)
++      eor     w2,w2,w21,ror#13
++      add     w20,w20,w17                     // h+=Ch(e,f,g)
++      and     w19,w19,w28                     // (b^c)&=(a^b)
++      eor     w0,w0,w9,ror#19
++      eor     w1,w1,w12,lsr#3 // sigma0(X[i+1])
++      add     w20,w20,w16                     // h+=Sigma1(e)
++      eor     w19,w19,w22                     // Maj(a,b,c)
++      eor     w17,w2,w21,ror#22       // Sigma0(a)
++      eor     w0,w0,w9,lsr#10 // sigma1(X[i+14])
++      add     w11,w11,w4
++      add     w24,w24,w20                     // d+=h
++      add     w20,w20,w19                     // h+=Maj(a,b,c)
++      ldr     w19,[x30],#4            // *K++, w28 in next round
++      add     w11,w11,w1
++      add     w20,w20,w17                     // h+=Sigma0(a)
++      add     w11,w11,w0
++      ldr     w0,[sp,#4]
++      str     w3,[sp,#0]
++      ror     w16,w24,#6
++      add     w27,w27,w19                     // h+=K[i]
++      ror     w2,w13,#7
++      and     w17,w25,w24
++      ror     w1,w10,#17
++      bic     w19,w26,w24
++      ror     w3,w20,#2
++      add     w27,w27,w11                     // h+=X[i]
++      eor     w16,w16,w24,ror#11
++      eor     w2,w2,w13,ror#18
++      orr     w17,w17,w19                     // Ch(e,f,g)
++      eor     w19,w20,w21                     // a^b, b^c in next round
++      eor     w16,w16,w24,ror#25      // Sigma1(e)
++      eor     w3,w3,w20,ror#13
++      add     w27,w27,w17                     // h+=Ch(e,f,g)
++      and     w28,w28,w19                     // (b^c)&=(a^b)
++      eor     w1,w1,w10,ror#19
++      eor     w2,w2,w13,lsr#3 // sigma0(X[i+1])
++      add     w27,w27,w16                     // h+=Sigma1(e)
++      eor     w28,w28,w21                     // Maj(a,b,c)
++      eor     w17,w3,w20,ror#22       // Sigma0(a)
++      eor     w1,w1,w10,lsr#10        // sigma1(X[i+14])
++      add     w12,w12,w5
++      add     w23,w23,w27                     // d+=h
++      add     w27,w27,w28                     // h+=Maj(a,b,c)
++      ldr     w28,[x30],#4            // *K++, w19 in next round
++      add     w12,w12,w2
++      add     w27,w27,w17                     // h+=Sigma0(a)
++      add     w12,w12,w1
++      ldr     w1,[sp,#8]
++      str     w4,[sp,#4]
++      ror     w16,w23,#6
++      add     w26,w26,w28                     // h+=K[i]
++      ror     w3,w14,#7
++      and     w17,w24,w23
++      ror     w2,w11,#17
++      bic     w28,w25,w23
++      ror     w4,w27,#2
++      add     w26,w26,w12                     // h+=X[i]
++      eor     w16,w16,w23,ror#11
++      eor     w3,w3,w14,ror#18
++      orr     w17,w17,w28                     // Ch(e,f,g)
++      eor     w28,w27,w20                     // a^b, b^c in next round
++      eor     w16,w16,w23,ror#25      // Sigma1(e)
++      eor     w4,w4,w27,ror#13
++      add     w26,w26,w17                     // h+=Ch(e,f,g)
++      and     w19,w19,w28                     // (b^c)&=(a^b)
++      eor     w2,w2,w11,ror#19
++      eor     w3,w3,w14,lsr#3 // sigma0(X[i+1])
++      add     w26,w26,w16                     // h+=Sigma1(e)
++      eor     w19,w19,w20                     // Maj(a,b,c)
++      eor     w17,w4,w27,ror#22       // Sigma0(a)
++      eor     w2,w2,w11,lsr#10        // sigma1(X[i+14])
++      add     w13,w13,w6
++      add     w22,w22,w26                     // d+=h
++      add     w26,w26,w19                     // h+=Maj(a,b,c)
++      ldr     w19,[x30],#4            // *K++, w28 in next round
++      add     w13,w13,w3
++      add     w26,w26,w17                     // h+=Sigma0(a)
++      add     w13,w13,w2
++      ldr     w2,[sp,#12]
++      str     w5,[sp,#8]
++      ror     w16,w22,#6
++      add     w25,w25,w19                     // h+=K[i]
++      ror     w4,w15,#7
++      and     w17,w23,w22
++      ror     w3,w12,#17
++      bic     w19,w24,w22
++      ror     w5,w26,#2
++      add     w25,w25,w13                     // h+=X[i]
++      eor     w16,w16,w22,ror#11
++      eor     w4,w4,w15,ror#18
++      orr     w17,w17,w19                     // Ch(e,f,g)
++      eor     w19,w26,w27                     // a^b, b^c in next round
++      eor     w16,w16,w22,ror#25      // Sigma1(e)
++      eor     w5,w5,w26,ror#13
++      add     w25,w25,w17                     // h+=Ch(e,f,g)
++      and     w28,w28,w19                     // (b^c)&=(a^b)
++      eor     w3,w3,w12,ror#19
++      eor     w4,w4,w15,lsr#3 // sigma0(X[i+1])
++      add     w25,w25,w16                     // h+=Sigma1(e)
++      eor     w28,w28,w27                     // Maj(a,b,c)
++      eor     w17,w5,w26,ror#22       // Sigma0(a)
++      eor     w3,w3,w12,lsr#10        // sigma1(X[i+14])
++      add     w14,w14,w7
++      add     w21,w21,w25                     // d+=h
++      add     w25,w25,w28                     // h+=Maj(a,b,c)
++      ldr     w28,[x30],#4            // *K++, w19 in next round
++      add     w14,w14,w4
++      add     w25,w25,w17                     // h+=Sigma0(a)
++      add     w14,w14,w3
++      ldr     w3,[sp,#0]
++      str     w6,[sp,#12]
++      ror     w16,w21,#6
++      add     w24,w24,w28                     // h+=K[i]
++      ror     w5,w0,#7
++      and     w17,w22,w21
++      ror     w4,w13,#17
++      bic     w28,w23,w21
++      ror     w6,w25,#2
++      add     w24,w24,w14                     // h+=X[i]
++      eor     w16,w16,w21,ror#11
++      eor     w5,w5,w0,ror#18
++      orr     w17,w17,w28                     // Ch(e,f,g)
++      eor     w28,w25,w26                     // a^b, b^c in next round
++      eor     w16,w16,w21,ror#25      // Sigma1(e)
++      eor     w6,w6,w25,ror#13
++      add     w24,w24,w17                     // h+=Ch(e,f,g)
++      and     w19,w19,w28                     // (b^c)&=(a^b)
++      eor     w4,w4,w13,ror#19
++      eor     w5,w5,w0,lsr#3  // sigma0(X[i+1])
++      add     w24,w24,w16                     // h+=Sigma1(e)
++      eor     w19,w19,w26                     // Maj(a,b,c)
++      eor     w17,w6,w25,ror#22       // Sigma0(a)
++      eor     w4,w4,w13,lsr#10        // sigma1(X[i+14])
++      add     w15,w15,w8
++      add     w20,w20,w24                     // d+=h
++      add     w24,w24,w19                     // h+=Maj(a,b,c)
++      ldr     w19,[x30],#4            // *K++, w28 in next round
++      add     w15,w15,w5
++      add     w24,w24,w17                     // h+=Sigma0(a)
++      add     w15,w15,w4
++      ldr     w4,[sp,#4]
++      str     w7,[sp,#0]
++      ror     w16,w20,#6
++      add     w23,w23,w19                     // h+=K[i]
++      ror     w6,w1,#7
++      and     w17,w21,w20
++      ror     w5,w14,#17
++      bic     w19,w22,w20
++      ror     w7,w24,#2
++      add     w23,w23,w15                     // h+=X[i]
++      eor     w16,w16,w20,ror#11
++      eor     w6,w6,w1,ror#18
++      orr     w17,w17,w19                     // Ch(e,f,g)
++      eor     w19,w24,w25                     // a^b, b^c in next round
++      eor     w16,w16,w20,ror#25      // Sigma1(e)
++      eor     w7,w7,w24,ror#13
++      add     w23,w23,w17                     // h+=Ch(e,f,g)
++      and     w28,w28,w19                     // (b^c)&=(a^b)
++      eor     w5,w5,w14,ror#19
++      eor     w6,w6,w1,lsr#3  // sigma0(X[i+1])
++      add     w23,w23,w16                     // h+=Sigma1(e)
++      eor     w28,w28,w25                     // Maj(a,b,c)
++      eor     w17,w7,w24,ror#22       // Sigma0(a)
++      eor     w5,w5,w14,lsr#10        // sigma1(X[i+14])
++      add     w0,w0,w9
++      add     w27,w27,w23                     // d+=h
++      add     w23,w23,w28                     // h+=Maj(a,b,c)
++      ldr     w28,[x30],#4            // *K++, w19 in next round
++      add     w0,w0,w6
++      add     w23,w23,w17                     // h+=Sigma0(a)
++      add     w0,w0,w5
++      ldr     w5,[sp,#8]
++      str     w8,[sp,#4]
++      ror     w16,w27,#6
++      add     w22,w22,w28                     // h+=K[i]
++      ror     w7,w2,#7
++      and     w17,w20,w27
++      ror     w6,w15,#17
++      bic     w28,w21,w27
++      ror     w8,w23,#2
++      add     w22,w22,w0                      // h+=X[i]
++      eor     w16,w16,w27,ror#11
++      eor     w7,w7,w2,ror#18
++      orr     w17,w17,w28                     // Ch(e,f,g)
++      eor     w28,w23,w24                     // a^b, b^c in next round
++      eor     w16,w16,w27,ror#25      // Sigma1(e)
++      eor     w8,w8,w23,ror#13
++      add     w22,w22,w17                     // h+=Ch(e,f,g)
++      and     w19,w19,w28                     // (b^c)&=(a^b)
++      eor     w6,w6,w15,ror#19
++      eor     w7,w7,w2,lsr#3  // sigma0(X[i+1])
++      add     w22,w22,w16                     // h+=Sigma1(e)
++      eor     w19,w19,w24                     // Maj(a,b,c)
++      eor     w17,w8,w23,ror#22       // Sigma0(a)
++      eor     w6,w6,w15,lsr#10        // sigma1(X[i+14])
++      add     w1,w1,w10
++      add     w26,w26,w22                     // d+=h
++      add     w22,w22,w19                     // h+=Maj(a,b,c)
++      ldr     w19,[x30],#4            // *K++, w28 in next round
++      add     w1,w1,w7
++      add     w22,w22,w17                     // h+=Sigma0(a)
++      add     w1,w1,w6
++      ldr     w6,[sp,#12]
++      str     w9,[sp,#8]
++      ror     w16,w26,#6
++      add     w21,w21,w19                     // h+=K[i]
++      ror     w8,w3,#7
++      and     w17,w27,w26
++      ror     w7,w0,#17
++      bic     w19,w20,w26
++      ror     w9,w22,#2
++      add     w21,w21,w1                      // h+=X[i]
++      eor     w16,w16,w26,ror#11
++      eor     w8,w8,w3,ror#18
++      orr     w17,w17,w19                     // Ch(e,f,g)
++      eor     w19,w22,w23                     // a^b, b^c in next round
++      eor     w16,w16,w26,ror#25      // Sigma1(e)
++      eor     w9,w9,w22,ror#13
++      add     w21,w21,w17                     // h+=Ch(e,f,g)
++      and     w28,w28,w19                     // (b^c)&=(a^b)
++      eor     w7,w7,w0,ror#19
++      eor     w8,w8,w3,lsr#3  // sigma0(X[i+1])
++      add     w21,w21,w16                     // h+=Sigma1(e)
++      eor     w28,w28,w23                     // Maj(a,b,c)
++      eor     w17,w9,w22,ror#22       // Sigma0(a)
++      eor     w7,w7,w0,lsr#10 // sigma1(X[i+14])
++      add     w2,w2,w11
++      add     w25,w25,w21                     // d+=h
++      add     w21,w21,w28                     // h+=Maj(a,b,c)
++      ldr     w28,[x30],#4            // *K++, w19 in next round
++      add     w2,w2,w8
++      add     w21,w21,w17                     // h+=Sigma0(a)
++      add     w2,w2,w7
++      ldr     w7,[sp,#0]
++      str     w10,[sp,#12]
++      ror     w16,w25,#6
++      add     w20,w20,w28                     // h+=K[i]
++      ror     w9,w4,#7
++      and     w17,w26,w25
++      ror     w8,w1,#17
++      bic     w28,w27,w25
++      ror     w10,w21,#2
++      add     w20,w20,w2                      // h+=X[i]
++      eor     w16,w16,w25,ror#11
++      eor     w9,w9,w4,ror#18
++      orr     w17,w17,w28                     // Ch(e,f,g)
++      eor     w28,w21,w22                     // a^b, b^c in next round
++      eor     w16,w16,w25,ror#25      // Sigma1(e)
++      eor     w10,w10,w21,ror#13
++      add     w20,w20,w17                     // h+=Ch(e,f,g)
++      and     w19,w19,w28                     // (b^c)&=(a^b)
++      eor     w8,w8,w1,ror#19
++      eor     w9,w9,w4,lsr#3  // sigma0(X[i+1])
++      add     w20,w20,w16                     // h+=Sigma1(e)
++      eor     w19,w19,w22                     // Maj(a,b,c)
++      eor     w17,w10,w21,ror#22      // Sigma0(a)
++      eor     w8,w8,w1,lsr#10 // sigma1(X[i+14])
++      add     w3,w3,w12
++      add     w24,w24,w20                     // d+=h
++      add     w20,w20,w19                     // h+=Maj(a,b,c)
++      ldr     w19,[x30],#4            // *K++, w28 in next round
++      add     w3,w3,w9
++      add     w20,w20,w17                     // h+=Sigma0(a)
++      add     w3,w3,w8
++      cbnz    w19,.Loop_16_xx
++
++      ldp     x0,x2,[x29,#96]
++      ldr     x1,[x29,#112]
++      sub     x30,x30,#260            // rewind
++
++      ldp     w3,w4,[x0]
++      ldp     w5,w6,[x0,#2*4]
++      add     x1,x1,#14*4                     // advance input pointer
++      ldp     w7,w8,[x0,#4*4]
++      add     w20,w20,w3
++      ldp     w9,w10,[x0,#6*4]
++      add     w21,w21,w4
++      add     w22,w22,w5
++      add     w23,w23,w6
++      stp     w20,w21,[x0]
++      add     w24,w24,w7
++      add     w25,w25,w8
++      stp     w22,w23,[x0,#2*4]
++      add     w26,w26,w9
++      add     w27,w27,w10
++      cmp     x1,x2
++      stp     w24,w25,[x0,#4*4]
++      stp     w26,w27,[x0,#6*4]
++      b.ne    .Loop
++
++      ldp     x19,x20,[x29,#16]
++      add     sp,sp,#4*4
++      ldp     x21,x22,[x29,#32]
++      ldp     x23,x24,[x29,#48]
++      ldp     x25,x26,[x29,#64]
++      ldp     x27,x28,[x29,#80]
++      ldp     x29,x30,[sp],#128
++      ret
++.size sha256_block_data_order,.-sha256_block_data_order
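
For reference, the scalar loop above is the standard FIPS 180-4 SHA-256 compression function with the rounds interleaved and the a..h working variables rotated through w20..w27 instead of being moved. A minimal C sketch of one round (not part of the patch; names are illustrative), using the same rotation amounts that appear in the ror/eor sequences above:

    #include <stdint.h>

    /* Rotate-right on 32-bit words, as used throughout the scalar code above. */
    static inline uint32_t ror32(uint32_t x, unsigned n)
    {
            return (x >> n) | (x << (32 - n));
    }

    /* FIPS 180-4 building blocks; the "Ch(e,f,g)", "Maj(a,b,c)", "Sigma1(e)"
     * and "Sigma0(a)" comments in the assembly refer to these. */
    static inline uint32_t Ch(uint32_t e, uint32_t f, uint32_t g)  { return (e & f) ^ (~e & g); }
    static inline uint32_t Maj(uint32_t a, uint32_t b, uint32_t c) { return (a & b) ^ (a & c) ^ (b & c); }
    static inline uint32_t Sigma0(uint32_t a) { return ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22); }
    static inline uint32_t Sigma1(uint32_t e) { return ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25); }
    /* Message-schedule functions used in the .Loop_16_xx expansion rounds. */
    static inline uint32_t sigma0(uint32_t x) { return ror32(x, 7)  ^ ror32(x, 18) ^ (x >> 3); }
    static inline uint32_t sigma1(uint32_t x) { return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10); }

    /* One round: T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i];
     * T2 = Sigma0(a) + Maj(a,b,c); then e' = d + T1 and a' = T1 + T2,
     * matching the "h+=K[i]", "h+=X[i]", "h+=Ch(e,f,g)", "h+=Sigma1(e)",
     * "d+=h", "h+=Maj(a,b,c)" and "h+=Sigma0(a)" comments above. */
    void sha256_round(uint32_t s[8], uint32_t Ki, uint32_t Wi)
    {
            uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
            uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
            uint32_t t1 = h + Sigma1(e) + Ch(e, f, g) + Ki + Wi;
            uint32_t t2 = Sigma0(a) + Maj(a, b, c);

            s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
            s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
    }

The assembly computes Ch as (e & f) | (~e & g), which is bit-for-bit equivalent, and Maj via the XOR form ((a ^ b) & (b ^ c)) ^ b, which is why each round's "a^b" value is reused as the next round's "b^c".
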
++
++.align        6
++.type .LK256,%object
++.LK256:
++      .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
++      .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
++      .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
++      .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
++      .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
++      .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
++      .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
++      .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
++      .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
++      .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
++      .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
++      .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
++      .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
++      .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
++      .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
++      .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
++      .long   0       //terminator
++.size .LK256,.-.LK256
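
The .LK256 table holds the 64 standard SHA-256 round constants, i.e. the first 32 bits of the fractional parts of the cube roots of the first 64 primes (FIPS 180-4), followed by a zero terminator that the scalar loop's cbnz uses to detect the end of the schedule. A small C cross-check of the values (not part of the patch; build with -lm):

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned printed = 0;
            for (unsigned n = 2; printed < 64; n++) {
                    unsigned is_prime = 1;
                    for (unsigned d = 2; d * d <= n; d++)
                            if (n % d == 0) { is_prime = 0; break; }
                    if (!is_prime)
                            continue;
                    /* First 32 bits of the fractional part of cbrt(prime). */
                    double frac = cbrt((double)n) - floor(cbrt((double)n));
                    printf("0x%08x%s", (uint32_t)(frac * 4294967296.0),
                           (++printed % 4) ? "," : "\n");
            }
            return 0;
    }
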
++#ifndef       __KERNEL__
++.align        3
++.LOPENSSL_armcap_P:
++# ifdef       __ILP32__
++      .long   OPENSSL_armcap_P-.
++# else
++      .quad   OPENSSL_armcap_P-.
++# endif
++#endif
++.asciz        "SHA256 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
++.align        2
++#ifndef       __KERNEL__
++.type sha256_block_armv8,%function
++.align        6
++sha256_block_armv8:
++.Lv8_entry:
++      stp             x29,x30,[sp,#-16]!
++      add             x29,sp,#0
++
++      ld1             {v0.4s,v1.4s},[x0]
++      adr             x3,.LK256
++
++.Loop_hw:
++      ld1             {v4.16b-v7.16b},[x1],#64
++      sub             x2,x2,#1
++      ld1             {v16.4s},[x3],#16
++      rev32           v4.16b,v4.16b
++      rev32           v5.16b,v5.16b
++      rev32           v6.16b,v6.16b
++      rev32           v7.16b,v7.16b
++      orr             v18.16b,v0.16b,v0.16b           // offload
++      orr             v19.16b,v1.16b,v1.16b
++      ld1             {v17.4s},[x3],#16
++      add             v16.4s,v16.4s,v4.4s
++      .inst   0x5e2828a4      //sha256su0 v4.16b,v5.16b
++      orr             v2.16b,v0.16b,v0.16b
++      .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
++      .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
++      .inst   0x5e0760c4      //sha256su1 v4.16b,v6.16b,v7.16b
++      ld1             {v16.4s},[x3],#16
++      add             v17.4s,v17.4s,v5.4s
++      .inst   0x5e2828c5      //sha256su0 v5.16b,v6.16b
++      orr             v2.16b,v0.16b,v0.16b
++      .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
++      .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
++      .inst   0x5e0460e5      //sha256su1 v5.16b,v7.16b,v4.16b
++      ld1             {v17.4s},[x3],#16
++      add             v16.4s,v16.4s,v6.4s
++      .inst   0x5e2828e6      //sha256su0 v6.16b,v7.16b
++      orr             v2.16b,v0.16b,v0.16b
++      .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
++      .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
++      .inst   0x5e056086      //sha256su1 v6.16b,v4.16b,v5.16b
++      ld1             {v16.4s},[x3],#16
++      add             v17.4s,v17.4s,v7.4s
++      .inst   0x5e282887      //sha256su0 v7.16b,v4.16b
++      orr             v2.16b,v0.16b,v0.16b
++      .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
++      .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
++      .inst   0x5e0660a7      //sha256su1 v7.16b,v5.16b,v6.16b
++      ld1             {v17.4s},[x3],#16
++      add             v16.4s,v16.4s,v4.4s
++      .inst   0x5e2828a4      //sha256su0 v4.16b,v5.16b
++      orr             v2.16b,v0.16b,v0.16b
++      .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
++      .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
++      .inst   0x5e0760c4      //sha256su1 v4.16b,v6.16b,v7.16b
++      ld1             {v16.4s},[x3],#16
++      add             v17.4s,v17.4s,v5.4s
++      .inst   0x5e2828c5      //sha256su0 v5.16b,v6.16b
++      orr             v2.16b,v0.16b,v0.16b
++      .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
++      .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
++      .inst   0x5e0460e5      //sha256su1 v5.16b,v7.16b,v4.16b
++      ld1             {v17.4s},[x3],#16
++      add             v16.4s,v16.4s,v6.4s
++      .inst   0x5e2828e6      //sha256su0 v6.16b,v7.16b
++      orr             v2.16b,v0.16b,v0.16b
++      .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
++      .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
++      .inst   0x5e056086      //sha256su1 v6.16b,v4.16b,v5.16b
++      ld1             {v16.4s},[x3],#16
++      add             v17.4s,v17.4s,v7.4s
++      .inst   0x5e282887      //sha256su0 v7.16b,v4.16b
++      orr             v2.16b,v0.16b,v0.16b
++      .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
++      .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
++      .inst   0x5e0660a7      //sha256su1 v7.16b,v5.16b,v6.16b
++      ld1             {v17.4s},[x3],#16
++      add             v16.4s,v16.4s,v4.4s
++      .inst   0x5e2828a4      //sha256su0 v4.16b,v5.16b
++      orr             v2.16b,v0.16b,v0.16b
++      .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
++      .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
++      .inst   0x5e0760c4      //sha256su1 v4.16b,v6.16b,v7.16b
++      ld1             {v16.4s},[x3],#16
++      add             v17.4s,v17.4s,v5.4s
++      .inst   0x5e2828c5      //sha256su0 v5.16b,v6.16b
++      orr             v2.16b,v0.16b,v0.16b
++      .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
++      .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
++      .inst   0x5e0460e5      //sha256su1 v5.16b,v7.16b,v4.16b
++      ld1             {v17.4s},[x3],#16
++      add             v16.4s,v16.4s,v6.4s
++      .inst   0x5e2828e6      //sha256su0 v6.16b,v7.16b
++      orr             v2.16b,v0.16b,v0.16b
++      .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
++      .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
++      .inst   0x5e056086      //sha256su1 v6.16b,v4.16b,v5.16b
++      ld1             {v16.4s},[x3],#16
++      add             v17.4s,v17.4s,v7.4s
++      .inst   0x5e282887      //sha256su0 v7.16b,v4.16b
++      orr             v2.16b,v0.16b,v0.16b
++      .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
++      .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
++      .inst   0x5e0660a7      //sha256su1 v7.16b,v5.16b,v6.16b
++      ld1             {v17.4s},[x3],#16
++      add             v16.4s,v16.4s,v4.4s
++      orr             v2.16b,v0.16b,v0.16b
++      .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
++      .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
++
++      ld1             {v16.4s},[x3],#16
++      add             v17.4s,v17.4s,v5.4s
++      orr             v2.16b,v0.16b,v0.16b
++      .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
++      .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
++
++      ld1             {v17.4s},[x3]
++      add             v16.4s,v16.4s,v6.4s
++      sub             x3,x3,#64*4-16  // rewind
++      orr             v2.16b,v0.16b,v0.16b
++      .inst   0x5e104020      //sha256h v0.16b,v1.16b,v16.4s
++      .inst   0x5e105041      //sha256h2 v1.16b,v2.16b,v16.4s
++
++      add             v17.4s,v17.4s,v7.4s
++      orr             v2.16b,v0.16b,v0.16b
++      .inst   0x5e114020      //sha256h v0.16b,v1.16b,v17.4s
++      .inst   0x5e115041      //sha256h2 v1.16b,v2.16b,v17.4s
++
++      add             v0.4s,v0.4s,v18.4s
++      add             v1.4s,v1.4s,v19.4s
++
++      cbnz            x2,.Loop_hw
++
++      st1             {v0.4s,v1.4s},[x0]
++
++      ldr             x29,[sp],#16
++      ret
++.size sha256_block_armv8,.-sha256_block_armv8
++#endif
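
The .inst words in sha256_block_armv8 above are the ARMv8 Crypto Extensions SHA-256 instructions (sha256h, sha256h2, sha256su0, sha256su1) emitted as raw opcodes, with the intended mnemonic in each trailing comment, so the file still assembles on toolchains without crypto-extension support. A rough C sketch of one four-round group using the corresponding ACLE intrinsics from <arm_neon.h> (helper and parameter names are mine; assumes a compiler targeting armv8-a with the crypto extension):

    #include <arm_neon.h>

    /* One group of four rounds, roughly what each
     *   add v16.4s,...; sha256su0; orr v2,...; sha256h; sha256h2; sha256su1
     * sequence above performs: extend the message schedule and update the
     * two state halves (a..d in one vector, e..h in the other). */
    static inline void sha256_hw_step(uint32x4_t *abcd, uint32x4_t *efgh,
                                      uint32x4_t *w0, uint32x4_t w1,
                                      uint32x4_t w2, uint32x4_t w3,
                                      uint32x4_t k)
    {
            uint32x4_t wk    = vaddq_u32(k, *w0);     /* add  v16.4s,v16.4s,v4.4s  */
            uint32x4_t abcd0 = *abcd;                 /* orr  v2.16b,v0.16b,v0.16b */

            *w0   = vsha256su1q_u32(vsha256su0q_u32(*w0, w1), w2, w3);
            *abcd = vsha256hq_u32(abcd0, *efgh, wk);  /* sha256h  */
            *efgh = vsha256h2q_u32(*efgh, abcd0, wk); /* sha256h2 */
    }

The extra orr v2,v0,v0 copy in the assembly exists for the same reason as abcd0 here: sha256h2 needs the pre-update a..d half after sha256h has already overwritten it.
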
++#ifdef        __KERNEL__
++.globl        sha256_block_neon
++#endif
++.type sha256_block_neon,%function
++.align        4
++sha256_block_neon:
++.Lneon_entry:
++      stp     x29, x30, [sp, #-16]!
++      mov     x29, sp
++      sub     sp,sp,#16*4
++
++      adr     x16,.LK256
++      add     x2,x1,x2,lsl#6  // len to point at the end of inp
++
++      ld1     {v0.16b},[x1], #16
++      ld1     {v1.16b},[x1], #16
++      ld1     {v2.16b},[x1], #16
++      ld1     {v3.16b},[x1], #16
++      ld1     {v4.4s},[x16], #16
++      ld1     {v5.4s},[x16], #16
++      ld1     {v6.4s},[x16], #16
++      ld1     {v7.4s},[x16], #16
++      rev32   v0.16b,v0.16b           // yes, even on
++      rev32   v1.16b,v1.16b           // big-endian
++      rev32   v2.16b,v2.16b
++      rev32   v3.16b,v3.16b
++      mov     x17,sp
++      add     v4.4s,v4.4s,v0.4s
++      add     v5.4s,v5.4s,v1.4s
++      add     v6.4s,v6.4s,v2.4s
++      st1     {v4.4s-v5.4s},[x17], #32
++      add     v7.4s,v7.4s,v3.4s
++      st1     {v6.4s-v7.4s},[x17]
++      sub     x17,x17,#32
++
++      ldp     w3,w4,[x0]
++      ldp     w5,w6,[x0,#8]
++      ldp     w7,w8,[x0,#16]
++      ldp     w9,w10,[x0,#24]
++      ldr     w12,[sp,#0]
++      mov     w13,wzr
++      eor     w14,w4,w5
++      mov     w15,wzr
++      b       .L_00_48
++
++.align        4
++.L_00_48:
++      ext     v4.16b,v0.16b,v1.16b,#4
++      add     w10,w10,w12
++      add     w3,w3,w15
++      and     w12,w8,w7
++      bic     w15,w9,w7
++      ext     v7.16b,v2.16b,v3.16b,#4
++      eor     w11,w7,w7,ror#5
++      add     w3,w3,w13
++      mov     d19,v3.d[1]
++      orr     w12,w12,w15
++      eor     w11,w11,w7,ror#19
++      ushr    v6.4s,v4.4s,#7
++      eor     w15,w3,w3,ror#11
++      ushr    v5.4s,v4.4s,#3
++      add     w10,w10,w12
++      add     v0.4s,v0.4s,v7.4s
++      ror     w11,w11,#6
++      sli     v6.4s,v4.4s,#25
++      eor     w13,w3,w4
++      eor     w15,w15,w3,ror#20
++      ushr    v7.4s,v4.4s,#18
++      add     w10,w10,w11
++      ldr     w12,[sp,#4]
++      and     w14,w14,w13
++      eor     v5.16b,v5.16b,v6.16b
++      ror     w15,w15,#2
++      add     w6,w6,w10
++      sli     v7.4s,v4.4s,#14
++      eor     w14,w14,w4
++      ushr    v16.4s,v19.4s,#17
++      add     w9,w9,w12
++      add     w10,w10,w15
++      and     w12,w7,w6
++      eor     v5.16b,v5.16b,v7.16b
++      bic     w15,w8,w6
++      eor     w11,w6,w6,ror#5
++      sli     v16.4s,v19.4s,#15
++      add     w10,w10,w14
++      orr     w12,w12,w15
++      ushr    v17.4s,v19.4s,#10
++      eor     w11,w11,w6,ror#19
++      eor     w15,w10,w10,ror#11
++      ushr    v7.4s,v19.4s,#19
++      add     w9,w9,w12
++      ror     w11,w11,#6
++      add     v0.4s,v0.4s,v5.4s
++      eor     w14,w10,w3
++      eor     w15,w15,w10,ror#20
++      sli     v7.4s,v19.4s,#13
++      add     w9,w9,w11
++      ldr     w12,[sp,#8]
++      and     w13,w13,w14
++      eor     v17.16b,v17.16b,v16.16b
++      ror     w15,w15,#2
++      add     w5,w5,w9
++      eor     w13,w13,w3
++      eor     v17.16b,v17.16b,v7.16b
++      add     w8,w8,w12
++      add     w9,w9,w15
++      and     w12,w6,w5
++      add     v0.4s,v0.4s,v17.4s
++      bic     w15,w7,w5
++      eor     w11,w5,w5,ror#5
++      add     w9,w9,w13
++      ushr    v18.4s,v0.4s,#17
++      orr     w12,w12,w15
++      ushr    v19.4s,v0.4s,#10
++      eor     w11,w11,w5,ror#19
++      eor     w15,w9,w9,ror#11
++      sli     v18.4s,v0.4s,#15
++      add     w8,w8,w12
++      ushr    v17.4s,v0.4s,#19
++      ror     w11,w11,#6
++      eor     w13,w9,w10
++      eor     v19.16b,v19.16b,v18.16b
++      eor     w15,w15,w9,ror#20
++      add     w8,w8,w11
++      sli     v17.4s,v0.4s,#13
++      ldr     w12,[sp,#12]
++      and     w14,w14,w13
++      ror     w15,w15,#2
++      ld1     {v4.4s},[x16], #16
++      add     w4,w4,w8
++      eor     v19.16b,v19.16b,v17.16b
++      eor     w14,w14,w10
++      eor     v17.16b,v17.16b,v17.16b
++      add     w7,w7,w12
++      add     w8,w8,w15
++      and     w12,w5,w4
++      mov     v17.d[1],v19.d[0]
++      bic     w15,w6,w4
++      eor     w11,w4,w4,ror#5
++      add     w8,w8,w14
++      add     v0.4s,v0.4s,v17.4s
++      orr     w12,w12,w15
++      eor     w11,w11,w4,ror#19
++      eor     w15,w8,w8,ror#11
++      add     v4.4s,v4.4s,v0.4s
++      add     w7,w7,w12
++      ror     w11,w11,#6
++      eor     w14,w8,w9
++      eor     w15,w15,w8,ror#20
++      add     w7,w7,w11
++      ldr     w12,[sp,#16]
++      and     w13,w13,w14
++      ror     w15,w15,#2
++      add     w3,w3,w7
++      eor     w13,w13,w9
++      st1     {v4.4s},[x17], #16
++      ext     v4.16b,v1.16b,v2.16b,#4
++      add     w6,w6,w12
++      add     w7,w7,w15
++      and     w12,w4,w3
++      bic     w15,w5,w3
++      ext     v7.16b,v3.16b,v0.16b,#4
++      eor     w11,w3,w3,ror#5
++      add     w7,w7,w13
++      mov     d19,v0.d[1]
++      orr     w12,w12,w15
++      eor     w11,w11,w3,ror#19
++      ushr    v6.4s,v4.4s,#7
++      eor     w15,w7,w7,ror#11
++      ushr    v5.4s,v4.4s,#3
++      add     w6,w6,w12
++      add     v1.4s,v1.4s,v7.4s
++      ror     w11,w11,#6
++      sli     v6.4s,v4.4s,#25
++      eor     w13,w7,w8
++      eor     w15,w15,w7,ror#20
++      ushr    v7.4s,v4.4s,#18
++      add     w6,w6,w11
++      ldr     w12,[sp,#20]
++      and     w14,w14,w13
++      eor     v5.16b,v5.16b,v6.16b
++      ror     w15,w15,#2
++      add     w10,w10,w6
++      sli     v7.4s,v4.4s,#14
++      eor     w14,w14,w8
++      ushr    v16.4s,v19.4s,#17
++      add     w5,w5,w12
++      add     w6,w6,w15
++      and     w12,w3,w10
++      eor     v5.16b,v5.16b,v7.16b
++      bic     w15,w4,w10
++      eor     w11,w10,w10,ror#5
++      sli     v16.4s,v19.4s,#15
++      add     w6,w6,w14
++      orr     w12,w12,w15
++      ushr    v17.4s,v19.4s,#10
++      eor     w11,w11,w10,ror#19
++      eor     w15,w6,w6,ror#11
++      ushr    v7.4s,v19.4s,#19
++      add     w5,w5,w12
++      ror     w11,w11,#6
++      add     v1.4s,v1.4s,v5.4s
++      eor     w14,w6,w7
++      eor     w15,w15,w6,ror#20
++      sli     v7.4s,v19.4s,#13
++      add     w5,w5,w11
++      ldr     w12,[sp,#24]
++      and     w13,w13,w14
++      eor     v17.16b,v17.16b,v16.16b
++      ror     w15,w15,#2
++      add     w9,w9,w5
++      eor     w13,w13,w7
++      eor     v17.16b,v17.16b,v7.16b
++      add     w4,w4,w12
++      add     w5,w5,w15
++      and     w12,w10,w9
++      add     v1.4s,v1.4s,v17.4s
++      bic     w15,w3,w9
++      eor     w11,w9,w9,ror#5
++      add     w5,w5,w13
++      ushr    v18.4s,v1.4s,#17
++      orr     w12,w12,w15
++      ushr    v19.4s,v1.4s,#10
++      eor     w11,w11,w9,ror#19
++      eor     w15,w5,w5,ror#11
++      sli     v18.4s,v1.4s,#15
++      add     w4,w4,w12
++      ushr    v17.4s,v1.4s,#19
++      ror     w11,w11,#6
++      eor     w13,w5,w6
++      eor     v19.16b,v19.16b,v18.16b
++      eor     w15,w15,w5,ror#20
++      add     w4,w4,w11
++      sli     v17.4s,v1.4s,#13
++      ldr     w12,[sp,#28]
++      and     w14,w14,w13
++      ror     w15,w15,#2
++      ld1     {v4.4s},[x16], #16
++      add     w8,w8,w4
++      eor     v19.16b,v19.16b,v17.16b
++      eor     w14,w14,w6
++      eor     v17.16b,v17.16b,v17.16b
++      add     w3,w3,w12
++      add     w4,w4,w15
++      and     w12,w9,w8
++      mov     v17.d[1],v19.d[0]
++      bic     w15,w10,w8
++      eor     w11,w8,w8,ror#5
++      add     w4,w4,w14
++      add     v1.4s,v1.4s,v17.4s
++      orr     w12,w12,w15
++      eor     w11,w11,w8,ror#19
++      eor     w15,w4,w4,ror#11
++      add     v4.4s,v4.4s,v1.4s
++      add     w3,w3,w12
++      ror     w11,w11,#6
++      eor     w14,w4,w5
++      eor     w15,w15,w4,ror#20
++      add     w3,w3,w11
++      ldr     w12,[sp,#32]
++      and     w13,w13,w14
++      ror     w15,w15,#2
++      add     w7,w7,w3
++      eor     w13,w13,w5
++      st1     {v4.4s},[x17], #16
++      ext     v4.16b,v2.16b,v3.16b,#4
++      add     w10,w10,w12
++      add     w3,w3,w15
++      and     w12,w8,w7
++      bic     w15,w9,w7
++      ext     v7.16b,v0.16b,v1.16b,#4
++      eor     w11,w7,w7,ror#5
++      add     w3,w3,w13
++      mov     d19,v1.d[1]
++      orr     w12,w12,w15
++      eor     w11,w11,w7,ror#19
++      ushr    v6.4s,v4.4s,#7
++      eor     w15,w3,w3,ror#11
++      ushr    v5.4s,v4.4s,#3
++      add     w10,w10,w12
++      add     v2.4s,v2.4s,v7.4s
++      ror     w11,w11,#6
++      sli     v6.4s,v4.4s,#25
++      eor     w13,w3,w4
++      eor     w15,w15,w3,ror#20
++      ushr    v7.4s,v4.4s,#18
++      add     w10,w10,w11
++      ldr     w12,[sp,#36]
++      and     w14,w14,w13
++      eor     v5.16b,v5.16b,v6.16b
++      ror     w15,w15,#2
++      add     w6,w6,w10
++      sli     v7.4s,v4.4s,#14
++      eor     w14,w14,w4
++      ushr    v16.4s,v19.4s,#17
++      add     w9,w9,w12
++      add     w10,w10,w15
++      and     w12,w7,w6
++      eor     v5.16b,v5.16b,v7.16b
++      bic     w15,w8,w6
++      eor     w11,w6,w6,ror#5
++      sli     v16.4s,v19.4s,#15
++      add     w10,w10,w14
++      orr     w12,w12,w15
++      ushr    v17.4s,v19.4s,#10
++      eor     w11,w11,w6,ror#19
++      eor     w15,w10,w10,ror#11
++      ushr    v7.4s,v19.4s,#19
++      add     w9,w9,w12
++      ror     w11,w11,#6
++      add     v2.4s,v2.4s,v5.4s
++      eor     w14,w10,w3
++      eor     w15,w15,w10,ror#20
++      sli     v7.4s,v19.4s,#13
++      add     w9,w9,w11
++      ldr     w12,[sp,#40]
++      and     w13,w13,w14
++      eor     v17.16b,v17.16b,v16.16b
++      ror     w15,w15,#2
++      add     w5,w5,w9
++      eor     w13,w13,w3
++      eor     v17.16b,v17.16b,v7.16b
++      add     w8,w8,w12
++      add     w9,w9,w15
++      and     w12,w6,w5
++      add     v2.4s,v2.4s,v17.4s
++      bic     w15,w7,w5
++      eor     w11,w5,w5,ror#5
++      add     w9,w9,w13
++      ushr    v18.4s,v2.4s,#17
++      orr     w12,w12,w15
++      ushr    v19.4s,v2.4s,#10
++      eor     w11,w11,w5,ror#19
++      eor     w15,w9,w9,ror#11
++      sli     v18.4s,v2.4s,#15
++      add     w8,w8,w12
++      ushr    v17.4s,v2.4s,#19
++      ror     w11,w11,#6
++      eor     w13,w9,w10
++      eor     v19.16b,v19.16b,v18.16b
++      eor     w15,w15,w9,ror#20
++      add     w8,w8,w11
++      sli     v17.4s,v2.4s,#13
++      ldr     w12,[sp,#44]
++      and     w14,w14,w13
++      ror     w15,w15,#2
++      ld1     {v4.4s},[x16], #16
++      add     w4,w4,w8
++      eor     v19.16b,v19.16b,v17.16b
++      eor     w14,w14,w10
++      eor     v17.16b,v17.16b,v17.16b
++      add     w7,w7,w12
++      add     w8,w8,w15
++      and     w12,w5,w4
++      mov     v17.d[1],v19.d[0]
++      bic     w15,w6,w4
++      eor     w11,w4,w4,ror#5
++      add     w8,w8,w14
++      add     v2.4s,v2.4s,v17.4s
++      orr     w12,w12,w15
++      eor     w11,w11,w4,ror#19
++      eor     w15,w8,w8,ror#11
++      add     v4.4s,v4.4s,v2.4s
++      add     w7,w7,w12
++      ror     w11,w11,#6
++      eor     w14,w8,w9
++      eor     w15,w15,w8,ror#20
++      add     w7,w7,w11
++      ldr     w12,[sp,#48]
++      and     w13,w13,w14
++      ror     w15,w15,#2
++      add     w3,w3,w7
++      eor     w13,w13,w9
++      st1     {v4.4s},[x17], #16
++      ext     v4.16b,v3.16b,v0.16b,#4
++      add     w6,w6,w12
++      add     w7,w7,w15
++      and     w12,w4,w3
++      bic     w15,w5,w3
++      ext     v7.16b,v1.16b,v2.16b,#4
++      eor     w11,w3,w3,ror#5
++      add     w7,w7,w13
++      mov     d19,v2.d[1]
++      orr     w12,w12,w15
++      eor     w11,w11,w3,ror#19
++      ushr    v6.4s,v4.4s,#7
++      eor     w15,w7,w7,ror#11
++      ushr    v5.4s,v4.4s,#3
++      add     w6,w6,w12
++      add     v3.4s,v3.4s,v7.4s
++      ror     w11,w11,#6
++      sli     v6.4s,v4.4s,#25
++      eor     w13,w7,w8
++      eor     w15,w15,w7,ror#20
++      ushr    v7.4s,v4.4s,#18
++      add     w6,w6,w11
++      ldr     w12,[sp,#52]
++      and     w14,w14,w13
++      eor     v5.16b,v5.16b,v6.16b
++      ror     w15,w15,#2
++      add     w10,w10,w6
++      sli     v7.4s,v4.4s,#14
++      eor     w14,w14,w8
++      ushr    v16.4s,v19.4s,#17
++      add     w5,w5,w12
++      add     w6,w6,w15
++      and     w12,w3,w10
++      eor     v5.16b,v5.16b,v7.16b
++      bic     w15,w4,w10
++      eor     w11,w10,w10,ror#5
++      sli     v16.4s,v19.4s,#15
++      add     w6,w6,w14
++      orr     w12,w12,w15
++      ushr    v17.4s,v19.4s,#10
++      eor     w11,w11,w10,ror#19
++      eor     w15,w6,w6,ror#11
++      ushr    v7.4s,v19.4s,#19
++      add     w5,w5,w12
++      ror     w11,w11,#6
++      add     v3.4s,v3.4s,v5.4s
++      eor     w14,w6,w7
++      eor     w15,w15,w6,ror#20
++      sli     v7.4s,v19.4s,#13
++      add     w5,w5,w11
++      ldr     w12,[sp,#56]
++      and     w13,w13,w14
++      eor     v17.16b,v17.16b,v16.16b
++      ror     w15,w15,#2
++      add     w9,w9,w5
++      eor     w13,w13,w7
++      eor     v17.16b,v17.16b,v7.16b
++      add     w4,w4,w12
++      add     w5,w5,w15
++      and     w12,w10,w9
++      add     v3.4s,v3.4s,v17.4s
++      bic     w15,w3,w9
++      eor     w11,w9,w9,ror#5
++      add     w5,w5,w13
++      ushr    v18.4s,v3.4s,#17
++      orr     w12,w12,w15
++      ushr    v19.4s,v3.4s,#10
++      eor     w11,w11,w9,ror#19
++      eor     w15,w5,w5,ror#11
++      sli     v18.4s,v3.4s,#15
++      add     w4,w4,w12
++      ushr    v17.4s,v3.4s,#19
++      ror     w11,w11,#6
++      eor     w13,w5,w6
++      eor     v19.16b,v19.16b,v18.16b
++      eor     w15,w15,w5,ror#20
++      add     w4,w4,w11
++      sli     v17.4s,v3.4s,#13
++      ldr     w12,[sp,#60]
++      and     w14,w14,w13
++      ror     w15,w15,#2
++      ld1     {v4.4s},[x16], #16
++      add     w8,w8,w4
++      eor     v19.16b,v19.16b,v17.16b
++      eor     w14,w14,w6
++      eor     v17.16b,v17.16b,v17.16b
++      add     w3,w3,w12
++      add     w4,w4,w15
++      and     w12,w9,w8
++      mov     v17.d[1],v19.d[0]
++      bic     w15,w10,w8
++      eor     w11,w8,w8,ror#5
++      add     w4,w4,w14
++      add     v3.4s,v3.4s,v17.4s
++      orr     w12,w12,w15
++      eor     w11,w11,w8,ror#19
++      eor     w15,w4,w4,ror#11
++      add     v4.4s,v4.4s,v3.4s
++      add     w3,w3,w12
++      ror     w11,w11,#6
++      eor     w14,w4,w5
++      eor     w15,w15,w4,ror#20
++      add     w3,w3,w11
++      ldr     w12,[x16]
++      and     w13,w13,w14
++      ror     w15,w15,#2
++      add     w7,w7,w3
++      eor     w13,w13,w5
++      st1     {v4.4s},[x17], #16
++      cmp     w12,#0                          // check for K256 terminator
++      ldr     w12,[sp,#0]
++      sub     x17,x17,#64
++      bne     .L_00_48
++
++      sub     x16,x16,#256            // rewind x16
++      cmp     x1,x2
++      mov     x17, #64
++      csel    x17, x17, xzr, eq
++      sub     x1,x1,x17                       // avoid SEGV
++      mov     x17,sp
++      add     w10,w10,w12
++      add     w3,w3,w15
++      and     w12,w8,w7
++      ld1     {v0.16b},[x1],#16
++      bic     w15,w9,w7
++      eor     w11,w7,w7,ror#5
++      ld1     {v4.4s},[x16],#16
++      add     w3,w3,w13
++      orr     w12,w12,w15
++      eor     w11,w11,w7,ror#19
++      eor     w15,w3,w3,ror#11
++      rev32   v0.16b,v0.16b
++      add     w10,w10,w12
++      ror     w11,w11,#6
++      eor     w13,w3,w4
++      eor     w15,w15,w3,ror#20
++      add     v4.4s,v4.4s,v0.4s
++      add     w10,w10,w11
++      ldr     w12,[sp,#4]
++      and     w14,w14,w13
++      ror     w15,w15,#2
++      add     w6,w6,w10
++      eor     w14,w14,w4
++      add     w9,w9,w12
++      add     w10,w10,w15
++      and     w12,w7,w6
++      bic     w15,w8,w6
++      eor     w11,w6,w6,ror#5
++      add     w10,w10,w14
++      orr     w12,w12,w15
++      eor     w11,w11,w6,ror#19
++      eor     w15,w10,w10,ror#11
++      add     w9,w9,w12
++      ror     w11,w11,#6
++      eor     w14,w10,w3
++      eor     w15,w15,w10,ror#20
++      add     w9,w9,w11
++      ldr     w12,[sp,#8]
++      and     w13,w13,w14
++      ror     w15,w15,#2
++      add     w5,w5,w9
++      eor     w13,w13,w3
++      add     w8,w8,w12
++      add     w9,w9,w15
++      and     w12,w6,w5
++      bic     w15,w7,w5
++      eor     w11,w5,w5,ror#5
++      add     w9,w9,w13
++      orr     w12,w12,w15
++      eor     w11,w11,w5,ror#19
++      eor     w15,w9,w9,ror#11
++      add     w8,w8,w12
++      ror     w11,w11,#6
++      eor     w13,w9,w10
++      eor     w15,w15,w9,ror#20
++      add     w8,w8,w11
++      ldr     w12,[sp,#12]
++      and     w14,w14,w13
++      ror     w15,w15,#2
++      add     w4,w4,w8
++      eor     w14,w14,w10
++      add     w7,w7,w12
++      add     w8,w8,w15
++      and     w12,w5,w4
++      bic     w15,w6,w4
++      eor     w11,w4,w4,ror#5
++      add     w8,w8,w14
++      orr     w12,w12,w15
++      eor     w11,w11,w4,ror#19
++      eor     w15,w8,w8,ror#11
++      add     w7,w7,w12
++      ror     w11,w11,#6
++      eor     w14,w8,w9
++      eor     w15,w15,w8,ror#20
++      add     w7,w7,w11
++      ldr     w12,[sp,#16]
++      and     w13,w13,w14
++      ror     w15,w15,#2
++      add     w3,w3,w7
++      eor     w13,w13,w9
++      st1     {v4.4s},[x17], #16
++      add     w6,w6,w12
++      add     w7,w7,w15
++      and     w12,w4,w3
++      ld1     {v1.16b},[x1],#16
++      bic     w15,w5,w3
++      eor     w11,w3,w3,ror#5
++      ld1     {v4.4s},[x16],#16
++      add     w7,w7,w13
++      orr     w12,w12,w15
++      eor     w11,w11,w3,ror#19
++      eor     w15,w7,w7,ror#11
++      rev32   v1.16b,v1.16b
++      add     w6,w6,w12
++      ror     w11,w11,#6
++      eor     w13,w7,w8
++      eor     w15,w15,w7,ror#20
++      add     v4.4s,v4.4s,v1.4s
++      add     w6,w6,w11
++      ldr     w12,[sp,#20]
++      and     w14,w14,w13
++      ror     w15,w15,#2
++      add     w10,w10,w6
++      eor     w14,w14,w8
++      add     w5,w5,w12
++      add     w6,w6,w15
++      and     w12,w3,w10
++      bic     w15,w4,w10
++      eor     w11,w10,w10,ror#5
++      add     w6,w6,w14
++      orr     w12,w12,w15
++      eor     w11,w11,w10,ror#19
++      eor     w15,w6,w6,ror#11
++      add     w5,w5,w12
++      ror     w11,w11,#6
++      eor     w14,w6,w7
++      eor     w15,w15,w6,ror#20
++      add     w5,w5,w11
++      ldr     w12,[sp,#24]
++      and     w13,w13,w14
++      ror     w15,w15,#2
++      add     w9,w9,w5
++      eor     w13,w13,w7
++      add     w4,w4,w12
++      add     w5,w5,w15
++      and     w12,w10,w9
++      bic     w15,w3,w9
++      eor     w11,w9,w9,ror#5
++      add     w5,w5,w13
++      orr     w12,w12,w15
++      eor     w11,w11,w9,ror#19
++      eor     w15,w5,w5,ror#11
++      add     w4,w4,w12
++      ror     w11,w11,#6
++      eor     w13,w5,w6
++      eor     w15,w15,w5,ror#20
++      add     w4,w4,w11
++      ldr     w12,[sp,#28]
++      and     w14,w14,w13
++      ror     w15,w15,#2
++      add     w8,w8,w4
++      eor     w14,w14,w6
++      add     w3,w3,w12
++      add     w4,w4,w15
++      and     w12,w9,w8
++      bic     w15,w10,w8
++      eor     w11,w8,w8,ror#5
++      add     w4,w4,w14
++      orr     w12,w12,w15
++      eor     w11,w11,w8,ror#19
++      eor     w15,w4,w4,ror#11
++      add     w3,w3,w12
++      ror     w11,w11,#6
++      eor     w14,w4,w5
++      eor     w15,w15,w4,ror#20
++      add     w3,w3,w11
++      ldr     w12,[sp,#32]
++      and     w13,w13,w14
++      ror     w15,w15,#2
++      add     w7,w7,w3
++      eor     w13,w13,w5
++      st1     {v4.4s},[x17], #16
++      add     w10,w10,w12
++      add     w3,w3,w15
++      and     w12,w8,w7
++      ld1     {v2.16b},[x1],#16
++      bic     w15,w9,w7
++      eor     w11,w7,w7,ror#5
++      ld1     {v4.4s},[x16],#16
++      add     w3,w3,w13
++      orr     w12,w12,w15
++      eor     w11,w11,w7,ror#19
++      eor     w15,w3,w3,ror#11
++      rev32   v2.16b,v2.16b
++      add     w10,w10,w12
++      ror     w11,w11,#6
++      eor     w13,w3,w4
++      eor     w15,w15,w3,ror#20
++      add     v4.4s,v4.4s,v2.4s
++      add     w10,w10,w11
++      ldr     w12,[sp,#36]
++      and     w14,w14,w13
++      ror     w15,w15,#2
++      add     w6,w6,w10
++      eor     w14,w14,w4
++      add     w9,w9,w12
++      add     w10,w10,w15
++      and     w12,w7,w6
++      bic     w15,w8,w6
++      eor     w11,w6,w6,ror#5
++      add     w10,w10,w14
++      orr     w12,w12,w15
++      eor     w11,w11,w6,ror#19
++      eor     w15,w10,w10,ror#11
++      add     w9,w9,w12
++      ror     w11,w11,#6
++      eor     w14,w10,w3
++      eor     w15,w15,w10,ror#20
++      add     w9,w9,w11
++      ldr     w12,[sp,#40]
++      and     w13,w13,w14
++      ror     w15,w15,#2
++      add     w5,w5,w9
++      eor     w13,w13,w3
++      add     w8,w8,w12
++      add     w9,w9,w15
++      and     w12,w6,w5
++      bic     w15,w7,w5
++      eor     w11,w5,w5,ror#5
++      add     w9,w9,w13
++      orr     w12,w12,w15
++      eor     w11,w11,w5,ror#19
++      eor     w15,w9,w9,ror#11
++      add     w8,w8,w12
++      ror     w11,w11,#6
++      eor     w13,w9,w10
++      eor     w15,w15,w9,ror#20
++      add     w8,w8,w11
++      ldr     w12,[sp,#44]
++      and     w14,w14,w13
++      ror     w15,w15,#2
++      add     w4,w4,w8
++      eor     w14,w14,w10
++      add     w7,w7,w12
++      add     w8,w8,w15
++      and     w12,w5,w4
++      bic     w15,w6,w4
++      eor     w11,w4,w4,ror#5
++      add     w8,w8,w14
++      orr     w12,w12,w15
++      eor     w11,w11,w4,ror#19
++      eor     w15,w8,w8,ror#11
++      add     w7,w7,w12
++      ror     w11,w11,#6
++      eor     w14,w8,w9
++      eor     w15,w15,w8,ror#20
++      add     w7,w7,w11
++      ldr     w12,[sp,#48]
++      and     w13,w13,w14
++      ror     w15,w15,#2
++      add     w3,w3,w7
++      eor     w13,w13,w9
++      st1     {v4.4s},[x17], #16
++      add     w6,w6,w12
++      add     w7,w7,w15
++      and     w12,w4,w3
++      ld1     {v3.16b},[x1],#16
++      bic     w15,w5,w3
++      eor     w11,w3,w3,ror#5
++      ld1     {v4.4s},[x16],#16
++      add     w7,w7,w13
++      orr     w12,w12,w15
++      eor     w11,w11,w3,ror#19
++      eor     w15,w7,w7,ror#11
++      rev32   v3.16b,v3.16b
++      add     w6,w6,w12
++      ror     w11,w11,#6
++      eor     w13,w7,w8
++      eor     w15,w15,w7,ror#20
++      add     v4.4s,v4.4s,v3.4s
++      add     w6,w6,w11
++      ldr     w12,[sp,#52]
++      and     w14,w14,w13
++      ror     w15,w15,#2
++      add     w10,w10,w6
++      eor     w14,w14,w8
++      add     w5,w5,w12
++      add     w6,w6,w15
++      and     w12,w3,w10
++      bic     w15,w4,w10
++      eor     w11,w10,w10,ror#5
++      add     w6,w6,w14
++      orr     w12,w12,w15
++      eor     w11,w11,w10,ror#19
++      eor     w15,w6,w6,ror#11
++      add     w5,w5,w12
++      ror     w11,w11,#6
++      eor     w14,w6,w7
++      eor     w15,w15,w6,ror#20
++      add     w5,w5,w11
++      ldr     w12,[sp,#56]
++      and     w13,w13,w14
++      ror     w15,w15,#2
++      add     w9,w9,w5
++      eor     w13,w13,w7
++      add     w4,w4,w12
++      add     w5,w5,w15
++      and     w12,w10,w9
++      bic     w15,w3,w9
++      eor     w11,w9,w9,ror#5
++      add     w5,w5,w13
++      orr     w12,w12,w15
++      eor     w11,w11,w9,ror#19
++      eor     w15,w5,w5,ror#11
++      add     w4,w4,w12
++      ror     w11,w11,#6
++      eor     w13,w5,w6
++      eor     w15,w15,w5,ror#20
++      add     w4,w4,w11
++      ldr     w12,[sp,#60]
++      and     w14,w14,w13
++      ror     w15,w15,#2
++      add     w8,w8,w4
++      eor     w14,w14,w6
++      add     w3,w3,w12
++      add     w4,w4,w15
++      and     w12,w9,w8
++      bic     w15,w10,w8
++      eor     w11,w8,w8,ror#5
++      add     w4,w4,w14
++      orr     w12,w12,w15
++      eor     w11,w11,w8,ror#19
++      eor     w15,w4,w4,ror#11
++      add     w3,w3,w12
++      ror     w11,w11,#6
++      eor     w14,w4,w5
++      eor     w15,w15,w4,ror#20
++      add     w3,w3,w11
++      and     w13,w13,w14
++      ror     w15,w15,#2
++      add     w7,w7,w3
++      eor     w13,w13,w5
++      st1     {v4.4s},[x17], #16
++      add     w3,w3,w15                       // h+=Sigma0(a) from the past
++      ldp     w11,w12,[x0,#0]
++      add     w3,w3,w13                       // h+=Maj(a,b,c) from the past
++      ldp     w13,w14,[x0,#8]
++      add     w3,w3,w11                       // accumulate
++      add     w4,w4,w12
++      ldp     w11,w12,[x0,#16]
++      add     w5,w5,w13
++      add     w6,w6,w14
++      ldp     w13,w14,[x0,#24]
++      add     w7,w7,w11
++      add     w8,w8,w12
++       ldr    w12,[sp,#0]
++      stp     w3,w4,[x0,#0]
++      add     w9,w9,w13
++       mov    w13,wzr
++      stp     w5,w6,[x0,#8]
++      add     w10,w10,w14
++      stp     w7,w8,[x0,#16]
++       eor    w14,w4,w5
++      stp     w9,w10,[x0,#24]
++       mov    w15,wzr
++       mov    x17,sp
++      b.ne    .L_00_48
++
++      ldr     x29,[x29]
++      add     sp,sp,#16*4+16
++      ret
++.size sha256_block_neon,.-sha256_block_neon
++#ifndef       __KERNEL__
++.comm OPENSSL_armcap_P,4,4
++#endif
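The interleaved scalar rounds above compute the standard SHA-256 round; the inline comments name its pieces (Sigma1(e), Ch(e,f,g), Maj(a,b,c), Sigma0(a)). As a reading aid only, and not part of the queued patch, here is a minimal plain-C sketch of that round using the conventional definitions; the names ror32, sha256_round and the s[8] state layout are illustrative assumptions, not identifiers taken from the assembly. (The assembly evaluates Sigma1(e) as a final ror #6 of e ^ ror5(e) ^ ror19(e), which is the same thing as the ror 6/11/25 form below, and Sigma0(a) as ror #2 of a ^ ror11(a) ^ ror20(a), i.e. ror 2/13/22.)

#include <stdint.h>

/* Illustrative reference round, not generated code from this patch. */
static inline uint32_t ror32(uint32_t x, unsigned n)
{
	return (x >> n) | (x << (32 - n));
}

static inline uint32_t Ch(uint32_t x, uint32_t y, uint32_t z)  { return (x & y) ^ (~x & z); }
static inline uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) { return (x & y) ^ (x & z) ^ (y & z); }
static inline uint32_t Sigma0(uint32_t x) { return ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22); }
static inline uint32_t Sigma1(uint32_t x) { return ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25); }

/* One round: T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]; T2 = Sigma0(a) + Maj(a,b,c) */
static void sha256_round(uint32_t s[8], uint32_t k, uint32_t w)
{
	uint32_t t1 = s[7] + Sigma1(s[4]) + Ch(s[4], s[5], s[6]) + k + w;
	uint32_t t2 = Sigma0(s[0]) + Maj(s[0], s[1], s[2]);

	s[7] = s[6]; s[6] = s[5]; s[5] = s[4];   /* rotate working variables */
	s[4] = s[3] + t1;                        /* e = d + T1 */
	s[3] = s[2]; s[2] = s[1]; s[1] = s[0];
	s[0] = t1 + t2;                          /* a = T1 + T2 */
}

Rather than moving values like the sketch does, the generated code rotates which register plays each working variable from round to round and defers the final "h += Sigma0(a)" and "h += Maj(a,b,c)" additions into the following round, which is what the "from the past" comments near the end of the loop refer to.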
+--- /dev/null
++++ b/arch/arm64/crypto/sha512-core.S
+@@ -0,0 +1,1085 @@
++// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
++//
++// Licensed under the OpenSSL license (the "License").  You may not use
++// this file except in compliance with the License.  You can obtain a copy
++// in the file LICENSE in the source distribution or at
++// https://www.openssl.org/source/license.html
++
++// ====================================================================
++// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
++// project. The module is, however, dual licensed under OpenSSL and
++// CRYPTOGAMS licenses depending on where you obtain it. For further
++// details see http://www.openssl.org/~appro/cryptogams/.
++//
++// Permission to use under GPLv2 terms is granted.
++// ====================================================================
++//
++// SHA256/512 for ARMv8.
++//
++// Performance in cycles per processed byte and improvement coefficient
++// over code generated with "default" compiler:
++//
++//            SHA256-hw       SHA256(*)       SHA512
++// Apple A7   1.97            10.5 (+33%)     6.73 (-1%(**))
++// Cortex-A53 2.38            15.5 (+115%)    10.0 (+150%(***))
++// Cortex-A57 2.31            11.6 (+86%)     7.51 (+260%(***))
++// Denver     2.01            10.5 (+26%)     6.70 (+8%)
++// X-Gene                     20.0 (+100%)    12.8 (+300%(***))
++// Mongoose   2.36            13.0 (+50%)     8.36 (+33%)
++//
++// (*)        Software SHA256 results are of lesser relevance, presented
++//    mostly for informational purposes.
++// (**)       The result is a trade-off: it's possible to improve it by
++//    10% (or by 1 cycle per round), but at the cost of 20% loss
++//    on Cortex-A53 (or by 4 cycles per round).
++// (***)      Super-impressive coefficients over gcc-generated code are
++//    an indication of some compiler "pathology"; most notably, code
++//    generated with -mgeneral-regs-only is significantly faster
++//    and the gap is only 40-90%.
++//
++// October 2016.
++//
++// Originally it was reckoned that it makes no sense to implement NEON
++// version of SHA256 for 64-bit processors. This is because performance
++// improvement on most wide-spread Cortex-A5x processors was observed
++// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
++// observed that 32-bit NEON SHA256 performs significantly better than
++// 64-bit scalar version on *some* of the more recent processors. As a
++// result, a 64-bit NEON version of SHA256 was added to provide the best
++// all-round performance. For example, it executes ~30% faster on X-Gene
++// and Mongoose. [For reference, NEON version of SHA512 is bound to
++// deliver much less improvement, likely *negative* on Cortex-A5x.
++// Which is why NEON support is limited to SHA256.]
++
++#ifndef       __KERNEL__
++# include "arm_arch.h"
++#endif
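The sha512_block_data_order routine below has the same T1/T2 round structure as SHA-256, only with 64-bit words and different rotation amounts; its inline comments again name Sigma1(e), Ch(e,f,g), Maj(a,b,c) and Sigma0(a), and the "sigma0(X[i+1])" / "sigma1(X[i+14])" comments refer to the message-schedule update. As a reading aid only (not part of the patch; helper names are illustrative), the corresponding plain-C definitions are:

#include <stdint.h>

static inline uint64_t ror64(uint64_t x, unsigned n)
{
	return (x >> n) | (x << (64 - n));
}

/* SHA-512 big Sigma (round) and small sigma (schedule) functions. */
static inline uint64_t Sigma0_512(uint64_t x) { return ror64(x, 28) ^ ror64(x, 34) ^ ror64(x, 39); }
static inline uint64_t Sigma1_512(uint64_t x) { return ror64(x, 14) ^ ror64(x, 18) ^ ror64(x, 41); }
static inline uint64_t sigma0_512(uint64_t x) { return ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7); }
static inline uint64_t sigma1_512(uint64_t x) { return ror64(x, 19) ^ ror64(x, 61) ^ (x >> 6); }

/*
 * Message-schedule step for rounds 16..79 on a 16-word circular buffer,
 * i.e. W[t] = W[t-16] + sigma0(W[t-15]) + W[t-7] + sigma1(W[t-2]),
 * written with the same X[i], X[i+1], X[i+9], X[i+14] indexing that the
 * comments in the .Loop_16_xx block use for the in-place update.
 */
static uint64_t sha512_schedule(const uint64_t x[16], unsigned i)
{
	return x[i & 15] + sigma0_512(x[(i + 1) & 15])
	     + x[(i + 9) & 15] + sigma1_512(x[(i + 14) & 15]);
}

In the assembly, Sigma1(e) is formed as ror #14 combined with ror #18 of (e ^ ror #23 e), which equals the ror 14/18/41 form above, and Sigma0(a) as ror #28 combined with ror #34 of (a ^ ror #5 a), i.e. ror 28/34/39.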
++
++.text
++
++.extern       OPENSSL_armcap_P
++.globl        sha512_block_data_order
++.type sha512_block_data_order,%function
++.align        6
++sha512_block_data_order:
++      stp     x29,x30,[sp,#-128]!
++      add     x29,sp,#0
++
++      stp     x19,x20,[sp,#16]
++      stp     x21,x22,[sp,#32]
++      stp     x23,x24,[sp,#48]
++      stp     x25,x26,[sp,#64]
++      stp     x27,x28,[sp,#80]
++      sub     sp,sp,#4*8
++
++      ldp     x20,x21,[x0]                            // load context
++      ldp     x22,x23,[x0,#2*8]
++      ldp     x24,x25,[x0,#4*8]
++      add     x2,x1,x2,lsl#7  // end of input
++      ldp     x26,x27,[x0,#6*8]
++      adr     x30,.LK512
++      stp     x0,x2,[x29,#96]
++
++.Loop:
++      ldp     x3,x4,[x1],#2*8
++      ldr     x19,[x30],#8                    // *K++
++      eor     x28,x21,x22                             // magic seed
++      str     x1,[x29,#112]
++#ifndef       __AARCH64EB__
++      rev     x3,x3                   // 0
++#endif
++      ror     x16,x24,#14
++      add     x27,x27,x19                     // h+=K[i]
++      eor     x6,x24,x24,ror#23
++      and     x17,x25,x24
++      bic     x19,x26,x24
++      add     x27,x27,x3                      // h+=X[i]
++      orr     x17,x17,x19                     // Ch(e,f,g)
++      eor     x19,x20,x21                     // a^b, b^c in next round
++      eor     x16,x16,x6,ror#18       // Sigma1(e)
++      ror     x6,x20,#28
++      add     x27,x27,x17                     // h+=Ch(e,f,g)
++      eor     x17,x20,x20,ror#5
++      add     x27,x27,x16                     // h+=Sigma1(e)
++      and     x28,x28,x19                     // (b^c)&=(a^b)
++      add     x23,x23,x27                     // d+=h
++      eor     x28,x28,x21                     // Maj(a,b,c)
++      eor     x17,x6,x17,ror#34       // Sigma0(a)
++      add     x27,x27,x28                     // h+=Maj(a,b,c)
++      ldr     x28,[x30],#8            // *K++, x19 in next round
++      //add   x27,x27,x17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     x4,x4                   // 1
++#endif
++      ldp     x5,x6,[x1],#2*8
++      add     x27,x27,x17                     // h+=Sigma0(a)
++      ror     x16,x23,#14
++      add     x26,x26,x28                     // h+=K[i]
++      eor     x7,x23,x23,ror#23
++      and     x17,x24,x23
++      bic     x28,x25,x23
++      add     x26,x26,x4                      // h+=X[i]
++      orr     x17,x17,x28                     // Ch(e,f,g)
++      eor     x28,x27,x20                     // a^b, b^c in next round
++      eor     x16,x16,x7,ror#18       // Sigma1(e)
++      ror     x7,x27,#28
++      add     x26,x26,x17                     // h+=Ch(e,f,g)
++      eor     x17,x27,x27,ror#5
++      add     x26,x26,x16                     // h+=Sigma1(e)
++      and     x19,x19,x28                     // (b^c)&=(a^b)
++      add     x22,x22,x26                     // d+=h
++      eor     x19,x19,x20                     // Maj(a,b,c)
++      eor     x17,x7,x17,ror#34       // Sigma0(a)
++      add     x26,x26,x19                     // h+=Maj(a,b,c)
++      ldr     x19,[x30],#8            // *K++, x28 in next round
++      //add   x26,x26,x17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     x5,x5                   // 2
++#endif
++      add     x26,x26,x17                     // h+=Sigma0(a)
++      ror     x16,x22,#14
++      add     x25,x25,x19                     // h+=K[i]
++      eor     x8,x22,x22,ror#23
++      and     x17,x23,x22
++      bic     x19,x24,x22
++      add     x25,x25,x5                      // h+=X[i]
++      orr     x17,x17,x19                     // Ch(e,f,g)
++      eor     x19,x26,x27                     // a^b, b^c in next round
++      eor     x16,x16,x8,ror#18       // Sigma1(e)
++      ror     x8,x26,#28
++      add     x25,x25,x17                     // h+=Ch(e,f,g)
++      eor     x17,x26,x26,ror#5
++      add     x25,x25,x16                     // h+=Sigma1(e)
++      and     x28,x28,x19                     // (b^c)&=(a^b)
++      add     x21,x21,x25                     // d+=h
++      eor     x28,x28,x27                     // Maj(a,b,c)
++      eor     x17,x8,x17,ror#34       // Sigma0(a)
++      add     x25,x25,x28                     // h+=Maj(a,b,c)
++      ldr     x28,[x30],#8            // *K++, x19 in next round
++      //add   x25,x25,x17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     x6,x6                   // 3
++#endif
++      ldp     x7,x8,[x1],#2*8
++      add     x25,x25,x17                     // h+=Sigma0(a)
++      ror     x16,x21,#14
++      add     x24,x24,x28                     // h+=K[i]
++      eor     x9,x21,x21,ror#23
++      and     x17,x22,x21
++      bic     x28,x23,x21
++      add     x24,x24,x6                      // h+=X[i]
++      orr     x17,x17,x28                     // Ch(e,f,g)
++      eor     x28,x25,x26                     // a^b, b^c in next round
++      eor     x16,x16,x9,ror#18       // Sigma1(e)
++      ror     x9,x25,#28
++      add     x24,x24,x17                     // h+=Ch(e,f,g)
++      eor     x17,x25,x25,ror#5
++      add     x24,x24,x16                     // h+=Sigma1(e)
++      and     x19,x19,x28                     // (b^c)&=(a^b)
++      add     x20,x20,x24                     // d+=h
++      eor     x19,x19,x26                     // Maj(a,b,c)
++      eor     x17,x9,x17,ror#34       // Sigma0(a)
++      add     x24,x24,x19                     // h+=Maj(a,b,c)
++      ldr     x19,[x30],#8            // *K++, x28 in next round
++      //add   x24,x24,x17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     x7,x7                   // 4
++#endif
++      add     x24,x24,x17                     // h+=Sigma0(a)
++      ror     x16,x20,#14
++      add     x23,x23,x19                     // h+=K[i]
++      eor     x10,x20,x20,ror#23
++      and     x17,x21,x20
++      bic     x19,x22,x20
++      add     x23,x23,x7                      // h+=X[i]
++      orr     x17,x17,x19                     // Ch(e,f,g)
++      eor     x19,x24,x25                     // a^b, b^c in next round
++      eor     x16,x16,x10,ror#18      // Sigma1(e)
++      ror     x10,x24,#28
++      add     x23,x23,x17                     // h+=Ch(e,f,g)
++      eor     x17,x24,x24,ror#5
++      add     x23,x23,x16                     // h+=Sigma1(e)
++      and     x28,x28,x19                     // (b^c)&=(a^b)
++      add     x27,x27,x23                     // d+=h
++      eor     x28,x28,x25                     // Maj(a,b,c)
++      eor     x17,x10,x17,ror#34      // Sigma0(a)
++      add     x23,x23,x28                     // h+=Maj(a,b,c)
++      ldr     x28,[x30],#8            // *K++, x19 in next round
++      //add   x23,x23,x17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     x8,x8                   // 5
++#endif
++      ldp     x9,x10,[x1],#2*8
++      add     x23,x23,x17                     // h+=Sigma0(a)
++      ror     x16,x27,#14
++      add     x22,x22,x28                     // h+=K[i]
++      eor     x11,x27,x27,ror#23
++      and     x17,x20,x27
++      bic     x28,x21,x27
++      add     x22,x22,x8                      // h+=X[i]
++      orr     x17,x17,x28                     // Ch(e,f,g)
++      eor     x28,x23,x24                     // a^b, b^c in next round
++      eor     x16,x16,x11,ror#18      // Sigma1(e)
++      ror     x11,x23,#28
++      add     x22,x22,x17                     // h+=Ch(e,f,g)
++      eor     x17,x23,x23,ror#5
++      add     x22,x22,x16                     // h+=Sigma1(e)
++      and     x19,x19,x28                     // (b^c)&=(a^b)
++      add     x26,x26,x22                     // d+=h
++      eor     x19,x19,x24                     // Maj(a,b,c)
++      eor     x17,x11,x17,ror#34      // Sigma0(a)
++      add     x22,x22,x19                     // h+=Maj(a,b,c)
++      ldr     x19,[x30],#8            // *K++, x28 in next round
++      //add   x22,x22,x17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     x9,x9                   // 6
++#endif
++      add     x22,x22,x17                     // h+=Sigma0(a)
++      ror     x16,x26,#14
++      add     x21,x21,x19                     // h+=K[i]
++      eor     x12,x26,x26,ror#23
++      and     x17,x27,x26
++      bic     x19,x20,x26
++      add     x21,x21,x9                      // h+=X[i]
++      orr     x17,x17,x19                     // Ch(e,f,g)
++      eor     x19,x22,x23                     // a^b, b^c in next round
++      eor     x16,x16,x12,ror#18      // Sigma1(e)
++      ror     x12,x22,#28
++      add     x21,x21,x17                     // h+=Ch(e,f,g)
++      eor     x17,x22,x22,ror#5
++      add     x21,x21,x16                     // h+=Sigma1(e)
++      and     x28,x28,x19                     // (b^c)&=(a^b)
++      add     x25,x25,x21                     // d+=h
++      eor     x28,x28,x23                     // Maj(a,b,c)
++      eor     x17,x12,x17,ror#34      // Sigma0(a)
++      add     x21,x21,x28                     // h+=Maj(a,b,c)
++      ldr     x28,[x30],#8            // *K++, x19 in next round
++      //add   x21,x21,x17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     x10,x10                 // 7
++#endif
++      ldp     x11,x12,[x1],#2*8
++      add     x21,x21,x17                     // h+=Sigma0(a)
++      ror     x16,x25,#14
++      add     x20,x20,x28                     // h+=K[i]
++      eor     x13,x25,x25,ror#23
++      and     x17,x26,x25
++      bic     x28,x27,x25
++      add     x20,x20,x10                     // h+=X[i]
++      orr     x17,x17,x28                     // Ch(e,f,g)
++      eor     x28,x21,x22                     // a^b, b^c in next round
++      eor     x16,x16,x13,ror#18      // Sigma1(e)
++      ror     x13,x21,#28
++      add     x20,x20,x17                     // h+=Ch(e,f,g)
++      eor     x17,x21,x21,ror#5
++      add     x20,x20,x16                     // h+=Sigma1(e)
++      and     x19,x19,x28                     // (b^c)&=(a^b)
++      add     x24,x24,x20                     // d+=h
++      eor     x19,x19,x22                     // Maj(a,b,c)
++      eor     x17,x13,x17,ror#34      // Sigma0(a)
++      add     x20,x20,x19                     // h+=Maj(a,b,c)
++      ldr     x19,[x30],#8            // *K++, x28 in next round
++      //add   x20,x20,x17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     x11,x11                 // 8
++#endif
++      add     x20,x20,x17                     // h+=Sigma0(a)
++      ror     x16,x24,#14
++      add     x27,x27,x19                     // h+=K[i]
++      eor     x14,x24,x24,ror#23
++      and     x17,x25,x24
++      bic     x19,x26,x24
++      add     x27,x27,x11                     // h+=X[i]
++      orr     x17,x17,x19                     // Ch(e,f,g)
++      eor     x19,x20,x21                     // a^b, b^c in next round
++      eor     x16,x16,x14,ror#18      // Sigma1(e)
++      ror     x14,x20,#28
++      add     x27,x27,x17                     // h+=Ch(e,f,g)
++      eor     x17,x20,x20,ror#5
++      add     x27,x27,x16                     // h+=Sigma1(e)
++      and     x28,x28,x19                     // (b^c)&=(a^b)
++      add     x23,x23,x27                     // d+=h
++      eor     x28,x28,x21                     // Maj(a,b,c)
++      eor     x17,x14,x17,ror#34      // Sigma0(a)
++      add     x27,x27,x28                     // h+=Maj(a,b,c)
++      ldr     x28,[x30],#8            // *K++, x19 in next round
++      //add   x27,x27,x17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     x12,x12                 // 9
++#endif
++      ldp     x13,x14,[x1],#2*8
++      add     x27,x27,x17                     // h+=Sigma0(a)
++      ror     x16,x23,#14
++      add     x26,x26,x28                     // h+=K[i]
++      eor     x15,x23,x23,ror#23
++      and     x17,x24,x23
++      bic     x28,x25,x23
++      add     x26,x26,x12                     // h+=X[i]
++      orr     x17,x17,x28                     // Ch(e,f,g)
++      eor     x28,x27,x20                     // a^b, b^c in next round
++      eor     x16,x16,x15,ror#18      // Sigma1(e)
++      ror     x15,x27,#28
++      add     x26,x26,x17                     // h+=Ch(e,f,g)
++      eor     x17,x27,x27,ror#5
++      add     x26,x26,x16                     // h+=Sigma1(e)
++      and     x19,x19,x28                     // (b^c)&=(a^b)
++      add     x22,x22,x26                     // d+=h
++      eor     x19,x19,x20                     // Maj(a,b,c)
++      eor     x17,x15,x17,ror#34      // Sigma0(a)
++      add     x26,x26,x19                     // h+=Maj(a,b,c)
++      ldr     x19,[x30],#8            // *K++, x28 in next round
++      //add   x26,x26,x17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     x13,x13                 // 10
++#endif
++      add     x26,x26,x17                     // h+=Sigma0(a)
++      ror     x16,x22,#14
++      add     x25,x25,x19                     // h+=K[i]
++      eor     x0,x22,x22,ror#23
++      and     x17,x23,x22
++      bic     x19,x24,x22
++      add     x25,x25,x13                     // h+=X[i]
++      orr     x17,x17,x19                     // Ch(e,f,g)
++      eor     x19,x26,x27                     // a^b, b^c in next round
++      eor     x16,x16,x0,ror#18       // Sigma1(e)
++      ror     x0,x26,#28
++      add     x25,x25,x17                     // h+=Ch(e,f,g)
++      eor     x17,x26,x26,ror#5
++      add     x25,x25,x16                     // h+=Sigma1(e)
++      and     x28,x28,x19                     // (b^c)&=(a^b)
++      add     x21,x21,x25                     // d+=h
++      eor     x28,x28,x27                     // Maj(a,b,c)
++      eor     x17,x0,x17,ror#34       // Sigma0(a)
++      add     x25,x25,x28                     // h+=Maj(a,b,c)
++      ldr     x28,[x30],#8            // *K++, x19 in next round
++      //add   x25,x25,x17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     x14,x14                 // 11
++#endif
++      ldp     x15,x0,[x1],#2*8
++      add     x25,x25,x17                     // h+=Sigma0(a)
++      str     x6,[sp,#24]
++      ror     x16,x21,#14
++      add     x24,x24,x28                     // h+=K[i]
++      eor     x6,x21,x21,ror#23
++      and     x17,x22,x21
++      bic     x28,x23,x21
++      add     x24,x24,x14                     // h+=X[i]
++      orr     x17,x17,x28                     // Ch(e,f,g)
++      eor     x28,x25,x26                     // a^b, b^c in next round
++      eor     x16,x16,x6,ror#18       // Sigma1(e)
++      ror     x6,x25,#28
++      add     x24,x24,x17                     // h+=Ch(e,f,g)
++      eor     x17,x25,x25,ror#5
++      add     x24,x24,x16                     // h+=Sigma1(e)
++      and     x19,x19,x28                     // (b^c)&=(a^b)
++      add     x20,x20,x24                     // d+=h
++      eor     x19,x19,x26                     // Maj(a,b,c)
++      eor     x17,x6,x17,ror#34       // Sigma0(a)
++      add     x24,x24,x19                     // h+=Maj(a,b,c)
++      ldr     x19,[x30],#8            // *K++, x28 in next round
++      //add   x24,x24,x17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     x15,x15                 // 12
++#endif
++      add     x24,x24,x17                     // h+=Sigma0(a)
++      str     x7,[sp,#0]
++      ror     x16,x20,#14
++      add     x23,x23,x19                     // h+=K[i]
++      eor     x7,x20,x20,ror#23
++      and     x17,x21,x20
++      bic     x19,x22,x20
++      add     x23,x23,x15                     // h+=X[i]
++      orr     x17,x17,x19                     // Ch(e,f,g)
++      eor     x19,x24,x25                     // a^b, b^c in next round
++      eor     x16,x16,x7,ror#18       // Sigma1(e)
++      ror     x7,x24,#28
++      add     x23,x23,x17                     // h+=Ch(e,f,g)
++      eor     x17,x24,x24,ror#5
++      add     x23,x23,x16                     // h+=Sigma1(e)
++      and     x28,x28,x19                     // (b^c)&=(a^b)
++      add     x27,x27,x23                     // d+=h
++      eor     x28,x28,x25                     // Maj(a,b,c)
++      eor     x17,x7,x17,ror#34       // Sigma0(a)
++      add     x23,x23,x28                     // h+=Maj(a,b,c)
++      ldr     x28,[x30],#8            // *K++, x19 in next round
++      //add   x23,x23,x17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     x0,x0                   // 13
++#endif
++      ldp     x1,x2,[x1]
++      add     x23,x23,x17                     // h+=Sigma0(a)
++      str     x8,[sp,#8]
++      ror     x16,x27,#14
++      add     x22,x22,x28                     // h+=K[i]
++      eor     x8,x27,x27,ror#23
++      and     x17,x20,x27
++      bic     x28,x21,x27
++      add     x22,x22,x0                      // h+=X[i]
++      orr     x17,x17,x28                     // Ch(e,f,g)
++      eor     x28,x23,x24                     // a^b, b^c in next round
++      eor     x16,x16,x8,ror#18       // Sigma1(e)
++      ror     x8,x23,#28
++      add     x22,x22,x17                     // h+=Ch(e,f,g)
++      eor     x17,x23,x23,ror#5
++      add     x22,x22,x16                     // h+=Sigma1(e)
++      and     x19,x19,x28                     // (b^c)&=(a^b)
++      add     x26,x26,x22                     // d+=h
++      eor     x19,x19,x24                     // Maj(a,b,c)
++      eor     x17,x8,x17,ror#34       // Sigma0(a)
++      add     x22,x22,x19                     // h+=Maj(a,b,c)
++      ldr     x19,[x30],#8            // *K++, x28 in next round
++      //add   x22,x22,x17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     x1,x1                   // 14
++#endif
++      ldr     x6,[sp,#24]
++      add     x22,x22,x17                     // h+=Sigma0(a)
++      str     x9,[sp,#16]
++      ror     x16,x26,#14
++      add     x21,x21,x19                     // h+=K[i]
++      eor     x9,x26,x26,ror#23
++      and     x17,x27,x26
++      bic     x19,x20,x26
++      add     x21,x21,x1                      // h+=X[i]
++      orr     x17,x17,x19                     // Ch(e,f,g)
++      eor     x19,x22,x23                     // a^b, b^c in next round
++      eor     x16,x16,x9,ror#18       // Sigma1(e)
++      ror     x9,x22,#28
++      add     x21,x21,x17                     // h+=Ch(e,f,g)
++      eor     x17,x22,x22,ror#5
++      add     x21,x21,x16                     // h+=Sigma1(e)
++      and     x28,x28,x19                     // (b^c)&=(a^b)
++      add     x25,x25,x21                     // d+=h
++      eor     x28,x28,x23                     // Maj(a,b,c)
++      eor     x17,x9,x17,ror#34       // Sigma0(a)
++      add     x21,x21,x28                     // h+=Maj(a,b,c)
++      ldr     x28,[x30],#8            // *K++, x19 in next round
++      //add   x21,x21,x17                     // h+=Sigma0(a)
++#ifndef       __AARCH64EB__
++      rev     x2,x2                   // 15
++#endif
++      ldr     x7,[sp,#0]
++      add     x21,x21,x17                     // h+=Sigma0(a)
++      str     x10,[sp,#24]
++      ror     x16,x25,#14
++      add     x20,x20,x28                     // h+=K[i]
++      ror     x9,x4,#1
++      and     x17,x26,x25
++      ror     x8,x1,#19
++      bic     x28,x27,x25
++      ror     x10,x21,#28
++      add     x20,x20,x2                      // h+=X[i]
++      eor     x16,x16,x25,ror#18
++      eor     x9,x9,x4,ror#8
++      orr     x17,x17,x28                     // Ch(e,f,g)
++      eor     x28,x21,x22                     // a^b, b^c in next round
++      eor     x16,x16,x25,ror#41      // Sigma1(e)
++      eor     x10,x10,x21,ror#34
++      add     x20,x20,x17                     // h+=Ch(e,f,g)
++      and     x19,x19,x28                     // (b^c)&=(a^b)
++      eor     x8,x8,x1,ror#61
++      eor     x9,x9,x4,lsr#7  // sigma0(X[i+1])
++      add     x20,x20,x16                     // h+=Sigma1(e)
++      eor     x19,x19,x22                     // Maj(a,b,c)
++      eor     x17,x10,x21,ror#39      // Sigma0(a)
++      eor     x8,x8,x1,lsr#6  // sigma1(X[i+14])
++      add     x3,x3,x12
++      add     x24,x24,x20                     // d+=h
++      add     x20,x20,x19                     // h+=Maj(a,b,c)
++      ldr     x19,[x30],#8            // *K++, x28 in next round
++      add     x3,x3,x9
++      add     x20,x20,x17                     // h+=Sigma0(a)
++      add     x3,x3,x8
++.Loop_16_xx:
++      ldr     x8,[sp,#8]
++      str     x11,[sp,#0]
++      ror     x16,x24,#14
++      add     x27,x27,x19                     // h+=K[i]
++      ror     x10,x5,#1
++      and     x17,x25,x24
++      ror     x9,x2,#19
++      bic     x19,x26,x24
++      ror     x11,x20,#28
++      add     x27,x27,x3                      // h+=X[i]
++      eor     x16,x16,x24,ror#18
++      eor     x10,x10,x5,ror#8
++      orr     x17,x17,x19                     // Ch(e,f,g)
++      eor     x19,x20,x21                     // a^b, b^c in next round
++      eor     x16,x16,x24,ror#41      // Sigma1(e)
++      eor     x11,x11,x20,ror#34
++      add     x27,x27,x17                     // h+=Ch(e,f,g)
++      and     x28,x28,x19                     // (b^c)&=(a^b)
++      eor     x9,x9,x2,ror#61
++      eor     x10,x10,x5,lsr#7        // sigma0(X[i+1])
++      add     x27,x27,x16                     // h+=Sigma1(e)
++      eor     x28,x28,x21                     // Maj(a,b,c)
++      eor     x17,x11,x20,ror#39      // Sigma0(a)
++      eor     x9,x9,x2,lsr#6  // sigma1(X[i+14])
++      add     x4,x4,x13
++      add     x23,x23,x27                     // d+=h
++      add     x27,x27,x28                     // h+=Maj(a,b,c)
++      ldr     x28,[x30],#8            // *K++, x19 in next round
++      add     x4,x4,x10
++      add     x27,x27,x17                     // h+=Sigma0(a)
++      add     x4,x4,x9
++      ldr     x9,[sp,#16]
++      str     x12,[sp,#8]
++      ror     x16,x23,#14
++      add     x26,x26,x28                     // h+=K[i]
++      ror     x11,x6,#1
++      and     x17,x24,x23
++      ror     x10,x3,#19
++      bic     x28,x25,x23
++      ror     x12,x27,#28
++      add     x26,x26,x4                      // h+=X[i]
++      eor     x16,x16,x23,ror#18
++      eor     x11,x11,x6,ror#8
++      orr     x17,x17,x28                     // Ch(e,f,g)
++      eor     x28,x27,x20                     // a^b, b^c in next round
++      eor     x16,x16,x23,ror#41      // Sigma1(e)
++      eor     x12,x12,x27,ror#34
++      add     x26,x26,x17                     // h+=Ch(e,f,g)
++      and     x19,x19,x28                     // (b^c)&=(a^b)
++      eor     x10,x10,x3,ror#61
++      eor     x11,x11,x6,lsr#7        // sigma0(X[i+1])
++      add     x26,x26,x16                     // h+=Sigma1(e)
++      eor     x19,x19,x20                     // Maj(a,b,c)
++      eor     x17,x12,x27,ror#39      // Sigma0(a)
++      eor     x10,x10,x3,lsr#6        // sigma1(X[i+14])
++      add     x5,x5,x14
++      add     x22,x22,x26                     // d+=h
++      add     x26,x26,x19                     // h+=Maj(a,b,c)
++      ldr     x19,[x30],#8            // *K++, x28 in next round
++      add     x5,x5,x11
++      add     x26,x26,x17                     // h+=Sigma0(a)
++      add     x5,x5,x10
++      ldr     x10,[sp,#24]
++      str     x13,[sp,#16]
++      ror     x16,x22,#14
++      add     x25,x25,x19                     // h+=K[i]
++      ror     x12,x7,#1
++      and     x17,x23,x22
++      ror     x11,x4,#19
++      bic     x19,x24,x22
++      ror     x13,x26,#28
++      add     x25,x25,x5                      // h+=X[i]
++      eor     x16,x16,x22,ror#18
++      eor     x12,x12,x7,ror#8
++      orr     x17,x17,x19                     // Ch(e,f,g)
++      eor     x19,x26,x27                     // a^b, b^c in next round
++      eor     x16,x16,x22,ror#41      // Sigma1(e)
++      eor     x13,x13,x26,ror#34
++      add     x25,x25,x17                     // h+=Ch(e,f,g)
++      and     x28,x28,x19                     // (b^c)&=(a^b)
++      eor     x11,x11,x4,ror#61
++      eor     x12,x12,x7,lsr#7        // sigma0(X[i+1])
++      add     x25,x25,x16                     // h+=Sigma1(e)
++      eor     x28,x28,x27                     // Maj(a,b,c)
++      eor     x17,x13,x26,ror#39      // Sigma0(a)
++      eor     x11,x11,x4,lsr#6        // sigma1(X[i+14])
++      add     x6,x6,x15
++      add     x21,x21,x25                     // d+=h
++      add     x25,x25,x28                     // h+=Maj(a,b,c)
++      ldr     x28,[x30],#8            // *K++, x19 in next round
++      add     x6,x6,x12
++      add     x25,x25,x17                     // h+=Sigma0(a)
++      add     x6,x6,x11
++      ldr     x11,[sp,#0]
++      str     x14,[sp,#24]
++      ror     x16,x21,#14
++      add     x24,x24,x28                     // h+=K[i]
++      ror     x13,x8,#1
++      and     x17,x22,x21
++      ror     x12,x5,#19
++      bic     x28,x23,x21
++      ror     x14,x25,#28
++      add     x24,x24,x6                      // h+=X[i]
++      eor     x16,x16,x21,ror#18
++      eor     x13,x13,x8,ror#8
++      orr     x17,x17,x28                     // Ch(e,f,g)
++      eor     x28,x25,x26                     // a^b, b^c in next round
++      eor     x16,x16,x21,ror#41      // Sigma1(e)
++      eor     x14,x14,x25,ror#34
++      add     x24,x24,x17                     // h+=Ch(e,f,g)
++      and     x19,x19,x28                     // (b^c)&=(a^b)
++      eor     x12,x12,x5,ror#61
++      eor     x13,x13,x8,lsr#7        // sigma0(X[i+1])
++      add     x24,x24,x16                     // h+=Sigma1(e)
++      eor     x19,x19,x26                     // Maj(a,b,c)
++      eor     x17,x14,x25,ror#39      // Sigma0(a)
++      eor     x12,x12,x5,lsr#6        // sigma1(X[i+14])
++      add     x7,x7,x0
++      add     x20,x20,x24                     // d+=h
++      add     x24,x24,x19                     // h+=Maj(a,b,c)
++      ldr     x19,[x30],#8            // *K++, x28 in next round
++      add     x7,x7,x13
++      add     x24,x24,x17                     // h+=Sigma0(a)
++      add     x7,x7,x12
++      ldr     x12,[sp,#8]
++      str     x15,[sp,#0]
++      ror     x16,x20,#14
++      add     x23,x23,x19                     // h+=K[i]
++      ror     x14,x9,#1
++      and     x17,x21,x20
++      ror     x13,x6,#19
++      bic     x19,x22,x20
++      ror     x15,x24,#28
++      add     x23,x23,x7                      // h+=X[i]
++      eor     x16,x16,x20,ror#18
++      eor     x14,x14,x9,ror#8
++      orr     x17,x17,x19                     // Ch(e,f,g)
++      eor     x19,x24,x25                     // a^b, b^c in next round
++      eor     x16,x16,x20,ror#41      // Sigma1(e)
++      eor     x15,x15,x24,ror#34
++      add     x23,x23,x17                     // h+=Ch(e,f,g)
++      and     x28,x28,x19                     // (b^c)&=(a^b)
++      eor     x13,x13,x6,ror#61
++      eor     x14,x14,x9,lsr#7        // sigma0(X[i+1])
++      add     x23,x23,x16                     // h+=Sigma1(e)
++      eor     x28,x28,x25                     // Maj(a,b,c)
++      eor     x17,x15,x24,ror#39      // Sigma0(a)
++      eor     x13,x13,x6,lsr#6        // sigma1(X[i+14])
++      add     x8,x8,x1
++      add     x27,x27,x23                     // d+=h
++      add     x23,x23,x28                     // h+=Maj(a,b,c)
++      ldr     x28,[x30],#8            // *K++, x19 in next round
++      add     x8,x8,x14
++      add     x23,x23,x17                     // h+=Sigma0(a)
++      add     x8,x8,x13
++      ldr     x13,[sp,#16]
++      str     x0,[sp,#8]
++      ror     x16,x27,#14
++      add     x22,x22,x28                     // h+=K[i]
++      ror     x15,x10,#1
++      and     x17,x20,x27
++      ror     x14,x7,#19
++      bic     x28,x21,x27
++      ror     x0,x23,#28
++      add     x22,x22,x8                      // h+=X[i]
++      eor     x16,x16,x27,ror#18
++      eor     x15,x15,x10,ror#8
++      orr     x17,x17,x28                     // Ch(e,f,g)
++      eor     x28,x23,x24                     // a^b, b^c in next round
++      eor     x16,x16,x27,ror#41      // Sigma1(e)
++      eor     x0,x0,x23,ror#34
++      add     x22,x22,x17                     // h+=Ch(e,f,g)
++      and     x19,x19,x28                     // (b^c)&=(a^b)
++      eor     x14,x14,x7,ror#61
++      eor     x15,x15,x10,lsr#7       // sigma0(X[i+1])
++      add     x22,x22,x16                     // h+=Sigma1(e)
++      eor     x19,x19,x24                     // Maj(a,b,c)
++      eor     x17,x0,x23,ror#39       // Sigma0(a)
++      eor     x14,x14,x7,lsr#6        // sigma1(X[i+14])
++      add     x9,x9,x2
++      add     x26,x26,x22                     // d+=h
++      add     x22,x22,x19                     // h+=Maj(a,b,c)
++      ldr     x19,[x30],#8            // *K++, x28 in next round
++      add     x9,x9,x15
++      add     x22,x22,x17                     // h+=Sigma0(a)
++      add     x9,x9,x14
++      ldr     x14,[sp,#24]
++      str     x1,[sp,#16]
++      ror     x16,x26,#14
++      add     x21,x21,x19                     // h+=K[i]
++      ror     x0,x11,#1
++      and     x17,x27,x26
++      ror     x15,x8,#19
++      bic     x19,x20,x26
++      ror     x1,x22,#28
++      add     x21,x21,x9                      // h+=X[i]
++      eor     x16,x16,x26,ror#18
++      eor     x0,x0,x11,ror#8
++      orr     x17,x17,x19                     // Ch(e,f,g)
++      eor     x19,x22,x23                     // a^b, b^c in next round
++      eor     x16,x16,x26,ror#41      // Sigma1(e)
++      eor     x1,x1,x22,ror#34
++      add     x21,x21,x17                     // h+=Ch(e,f,g)
++      and     x28,x28,x19                     // (b^c)&=(a^b)
++      eor     x15,x15,x8,ror#61
++      eor     x0,x0,x11,lsr#7 // sigma0(X[i+1])
++      add     x21,x21,x16                     // h+=Sigma1(e)
++      eor     x28,x28,x23                     // Maj(a,b,c)
++      eor     x17,x1,x22,ror#39       // Sigma0(a)
++      eor     x15,x15,x8,lsr#6        // sigma1(X[i+14])
++      add     x10,x10,x3
++      add     x25,x25,x21                     // d+=h
++      add     x21,x21,x28                     // h+=Maj(a,b,c)
++      ldr     x28,[x30],#8            // *K++, x19 in next round
++      add     x10,x10,x0
++      add     x21,x21,x17                     // h+=Sigma0(a)
++      add     x10,x10,x15
++      ldr     x15,[sp,#0]
++      str     x2,[sp,#24]
++      ror     x16,x25,#14
++      add     x20,x20,x28                     // h+=K[i]
++      ror     x1,x12,#1
++      and     x17,x26,x25
++      ror     x0,x9,#19
++      bic     x28,x27,x25
++      ror     x2,x21,#28
++      add     x20,x20,x10                     // h+=X[i]
++      eor     x16,x16,x25,ror#18
++      eor     x1,x1,x12,ror#8
++      orr     x17,x17,x28                     // Ch(e,f,g)
++      eor     x28,x21,x22                     // a^b, b^c in next round
++      eor     x16,x16,x25,ror#41      // Sigma1(e)
++      eor     x2,x2,x21,ror#34
++      add     x20,x20,x17                     // h+=Ch(e,f,g)
++      and     x19,x19,x28                     // (b^c)&=(a^b)
++      eor     x0,x0,x9,ror#61
++      eor     x1,x1,x12,lsr#7 // sigma0(X[i+1])
++      add     x20,x20,x16                     // h+=Sigma1(e)
++      eor     x19,x19,x22                     // Maj(a,b,c)
++      eor     x17,x2,x21,ror#39       // Sigma0(a)
++      eor     x0,x0,x9,lsr#6  // sigma1(X[i+14])
++      add     x11,x11,x4
++      add     x24,x24,x20                     // d+=h
++      add     x20,x20,x19                     // h+=Maj(a,b,c)
++      ldr     x19,[x30],#8            // *K++, x28 in next round
++      add     x11,x11,x1
++      add     x20,x20,x17                     // h+=Sigma0(a)
++      add     x11,x11,x0
++      ldr     x0,[sp,#8]
++      str     x3,[sp,#0]
++      ror     x16,x24,#14
++      add     x27,x27,x19                     // h+=K[i]
++      ror     x2,x13,#1
++      and     x17,x25,x24
++      ror     x1,x10,#19
++      bic     x19,x26,x24
++      ror     x3,x20,#28
++      add     x27,x27,x11                     // h+=X[i]
++      eor     x16,x16,x24,ror#18
++      eor     x2,x2,x13,ror#8
++      orr     x17,x17,x19                     // Ch(e,f,g)
++      eor     x19,x20,x21                     // a^b, b^c in next round
++      eor     x16,x16,x24,ror#41      // Sigma1(e)
++      eor     x3,x3,x20,ror#34
++      add     x27,x27,x17                     // h+=Ch(e,f,g)
++      and     x28,x28,x19                     // (b^c)&=(a^b)
++      eor     x1,x1,x10,ror#61
++      eor     x2,x2,x13,lsr#7 // sigma0(X[i+1])
++      add     x27,x27,x16                     // h+=Sigma1(e)
++      eor     x28,x28,x21                     // Maj(a,b,c)
++      eor     x17,x3,x20,ror#39       // Sigma0(a)
++      eor     x1,x1,x10,lsr#6 // sigma1(X[i+14])
++      add     x12,x12,x5
++      add     x23,x23,x27                     // d+=h
++      add     x27,x27,x28                     // h+=Maj(a,b,c)
++      ldr     x28,[x30],#8            // *K++, x19 in next round
++      add     x12,x12,x2
++      add     x27,x27,x17                     // h+=Sigma0(a)
++      add     x12,x12,x1
++      ldr     x1,[sp,#16]
++      str     x4,[sp,#8]
++      ror     x16,x23,#14
++      add     x26,x26,x28                     // h+=K[i]
++      ror     x3,x14,#1
++      and     x17,x24,x23
++      ror     x2,x11,#19
++      bic     x28,x25,x23
++      ror     x4,x27,#28
++      add     x26,x26,x12                     // h+=X[i]
++      eor     x16,x16,x23,ror#18
++      eor     x3,x3,x14,ror#8
++      orr     x17,x17,x28                     // Ch(e,f,g)
++      eor     x28,x27,x20                     // a^b, b^c in next round
++      eor     x16,x16,x23,ror#41      // Sigma1(e)
++      eor     x4,x4,x27,ror#34
++      add     x26,x26,x17                     // h+=Ch(e,f,g)
++      and     x19,x19,x28                     // (b^c)&=(a^b)
++      eor     x2,x2,x11,ror#61
++      eor     x3,x3,x14,lsr#7 // sigma0(X[i+1])
++      add     x26,x26,x16                     // h+=Sigma1(e)
++      eor     x19,x19,x20                     // Maj(a,b,c)
++      eor     x17,x4,x27,ror#39       // Sigma0(a)
++      eor     x2,x2,x11,lsr#6 // sigma1(X[i+14])
++      add     x13,x13,x6
++      add     x22,x22,x26                     // d+=h
++      add     x26,x26,x19                     // h+=Maj(a,b,c)
++      ldr     x19,[x30],#8            // *K++, x28 in next round
++      add     x13,x13,x3
++      add     x26,x26,x17                     // h+=Sigma0(a)
++      add     x13,x13,x2
++      ldr     x2,[sp,#24]
++      str     x5,[sp,#16]
++      ror     x16,x22,#14
++      add     x25,x25,x19                     // h+=K[i]
++      ror     x4,x15,#1
++      and     x17,x23,x22
++      ror     x3,x12,#19
++      bic     x19,x24,x22
++      ror     x5,x26,#28
++      add     x25,x25,x13                     // h+=X[i]
++      eor     x16,x16,x22,ror#18
++      eor     x4,x4,x15,ror#8
++      orr     x17,x17,x19                     // Ch(e,f,g)
++      eor     x19,x26,x27                     // a^b, b^c in next round
++      eor     x16,x16,x22,ror#41      // Sigma1(e)
++      eor     x5,x5,x26,ror#34
++      add     x25,x25,x17                     // h+=Ch(e,f,g)
++      and     x28,x28,x19                     // (b^c)&=(a^b)
++      eor     x3,x3,x12,ror#61
++      eor     x4,x4,x15,lsr#7 // sigma0(X[i+1])
++      add     x25,x25,x16                     // h+=Sigma1(e)
++      eor     x28,x28,x27                     // Maj(a,b,c)
++      eor     x17,x5,x26,ror#39       // Sigma0(a)
++      eor     x3,x3,x12,lsr#6 // sigma1(X[i+14])
++      add     x14,x14,x7
++      add     x21,x21,x25                     // d+=h
++      add     x25,x25,x28                     // h+=Maj(a,b,c)
++      ldr     x28,[x30],#8            // *K++, x19 in next round
++      add     x14,x14,x4
++      add     x25,x25,x17                     // h+=Sigma0(a)
++      add     x14,x14,x3
++      ldr     x3,[sp,#0]
++      str     x6,[sp,#24]
++      ror     x16,x21,#14
++      add     x24,x24,x28                     // h+=K[i]
++      ror     x5,x0,#1
++      and     x17,x22,x21
++      ror     x4,x13,#19
++      bic     x28,x23,x21
++      ror     x6,x25,#28
++      add     x24,x24,x14                     // h+=X[i]
++      eor     x16,x16,x21,ror#18
++      eor     x5,x5,x0,ror#8
++      orr     x17,x17,x28                     // Ch(e,f,g)
++      eor     x28,x25,x26                     // a^b, b^c in next round
++      eor     x16,x16,x21,ror#41      // Sigma1(e)
++      eor     x6,x6,x25,ror#34
++      add     x24,x24,x17                     // h+=Ch(e,f,g)
++      and     x19,x19,x28                     // (b^c)&=(a^b)
++      eor     x4,x4,x13,ror#61
++      eor     x5,x5,x0,lsr#7  // sigma0(X[i+1])
++      add     x24,x24,x16                     // h+=Sigma1(e)
++      eor     x19,x19,x26                     // Maj(a,b,c)
++      eor     x17,x6,x25,ror#39       // Sigma0(a)
++      eor     x4,x4,x13,lsr#6 // sigma1(X[i+14])
++      add     x15,x15,x8
++      add     x20,x20,x24                     // d+=h
++      add     x24,x24,x19                     // h+=Maj(a,b,c)
++      ldr     x19,[x30],#8            // *K++, x28 in next round
++      add     x15,x15,x5
++      add     x24,x24,x17                     // h+=Sigma0(a)
++      add     x15,x15,x4
++      ldr     x4,[sp,#8]
++      str     x7,[sp,#0]
++      ror     x16,x20,#14
++      add     x23,x23,x19                     // h+=K[i]
++      ror     x6,x1,#1
++      and     x17,x21,x20
++      ror     x5,x14,#19
++      bic     x19,x22,x20
++      ror     x7,x24,#28
++      add     x23,x23,x15                     // h+=X[i]
++      eor     x16,x16,x20,ror#18
++      eor     x6,x6,x1,ror#8
++      orr     x17,x17,x19                     // Ch(e,f,g)
++      eor     x19,x24,x25                     // a^b, b^c in next round
++      eor     x16,x16,x20,ror#41      // Sigma1(e)
++      eor     x7,x7,x24,ror#34
++      add     x23,x23,x17                     // h+=Ch(e,f,g)
++      and     x28,x28,x19                     // (b^c)&=(a^b)
++      eor     x5,x5,x14,ror#61
++      eor     x6,x6,x1,lsr#7  // sigma0(X[i+1])
++      add     x23,x23,x16                     // h+=Sigma1(e)
++      eor     x28,x28,x25                     // Maj(a,b,c)
++      eor     x17,x7,x24,ror#39       // Sigma0(a)
++      eor     x5,x5,x14,lsr#6 // sigma1(X[i+14])
++      add     x0,x0,x9
++      add     x27,x27,x23                     // d+=h
++      add     x23,x23,x28                     // h+=Maj(a,b,c)
++      ldr     x28,[x30],#8            // *K++, x19 in next round
++      add     x0,x0,x6
++      add     x23,x23,x17                     // h+=Sigma0(a)
++      add     x0,x0,x5
++      ldr     x5,[sp,#16]
++      str     x8,[sp,#8]
++      ror     x16,x27,#14
++      add     x22,x22,x28                     // h+=K[i]
++      ror     x7,x2,#1
++      and     x17,x20,x27
++      ror     x6,x15,#19
++      bic     x28,x21,x27
++      ror     x8,x23,#28
++      add     x22,x22,x0                      // h+=X[i]
++      eor     x16,x16,x27,ror#18
++      eor     x7,x7,x2,ror#8
++      orr     x17,x17,x28                     // Ch(e,f,g)
++      eor     x28,x23,x24                     // a^b, b^c in next round
++      eor     x16,x16,x27,ror#41      // Sigma1(e)
++      eor     x8,x8,x23,ror#34
++      add     x22,x22,x17                     // h+=Ch(e,f,g)
++      and     x19,x19,x28                     // (b^c)&=(a^b)
++      eor     x6,x6,x15,ror#61
++      eor     x7,x7,x2,lsr#7  // sigma0(X[i+1])
++      add     x22,x22,x16                     // h+=Sigma1(e)
++      eor     x19,x19,x24                     // Maj(a,b,c)
++      eor     x17,x8,x23,ror#39       // Sigma0(a)
++      eor     x6,x6,x15,lsr#6 // sigma1(X[i+14])
++      add     x1,x1,x10
++      add     x26,x26,x22                     // d+=h
++      add     x22,x22,x19                     // h+=Maj(a,b,c)
++      ldr     x19,[x30],#8            // *K++, x28 in next round
++      add     x1,x1,x7
++      add     x22,x22,x17                     // h+=Sigma0(a)
++      add     x1,x1,x6
++      ldr     x6,[sp,#24]
++      str     x9,[sp,#16]
++      ror     x16,x26,#14
++      add     x21,x21,x19                     // h+=K[i]
++      ror     x8,x3,#1
++      and     x17,x27,x26
++      ror     x7,x0,#19
++      bic     x19,x20,x26
++      ror     x9,x22,#28
++      add     x21,x21,x1                      // h+=X[i]
++      eor     x16,x16,x26,ror#18
++      eor     x8,x8,x3,ror#8
++      orr     x17,x17,x19                     // Ch(e,f,g)
++      eor     x19,x22,x23                     // a^b, b^c in next round
++      eor     x16,x16,x26,ror#41      // Sigma1(e)
++      eor     x9,x9,x22,ror#34
++      add     x21,x21,x17                     // h+=Ch(e,f,g)
++      and     x28,x28,x19                     // (b^c)&=(a^b)
++      eor     x7,x7,x0,ror#61
++      eor     x8,x8,x3,lsr#7  // sigma0(X[i+1])
++      add     x21,x21,x16                     // h+=Sigma1(e)
++      eor     x28,x28,x23                     // Maj(a,b,c)
++      eor     x17,x9,x22,ror#39       // Sigma0(a)
++      eor     x7,x7,x0,lsr#6  // sigma1(X[i+14])
++      add     x2,x2,x11
++      add     x25,x25,x21                     // d+=h
++      add     x21,x21,x28                     // h+=Maj(a,b,c)
++      ldr     x28,[x30],#8            // *K++, x19 in next round
++      add     x2,x2,x8
++      add     x21,x21,x17                     // h+=Sigma0(a)
++      add     x2,x2,x7
++      ldr     x7,[sp,#0]
++      str     x10,[sp,#24]
++      ror     x16,x25,#14
++      add     x20,x20,x28                     // h+=K[i]
++      ror     x9,x4,#1
++      and     x17,x26,x25
++      ror     x8,x1,#19
++      bic     x28,x27,x25
++      ror     x10,x21,#28
++      add     x20,x20,x2                      // h+=X[i]
++      eor     x16,x16,x25,ror#18
++      eor     x9,x9,x4,ror#8
++      orr     x17,x17,x28                     // Ch(e,f,g)
++      eor     x28,x21,x22                     // a^b, b^c in next round
++      eor     x16,x16,x25,ror#41      // Sigma1(e)
++      eor     x10,x10,x21,ror#34
++      add     x20,x20,x17                     // h+=Ch(e,f,g)
++      and     x19,x19,x28                     // (b^c)&=(a^b)
++      eor     x8,x8,x1,ror#61
++      eor     x9,x9,x4,lsr#7  // sigma0(X[i+1])
++      add     x20,x20,x16                     // h+=Sigma1(e)
++      eor     x19,x19,x22                     // Maj(a,b,c)
++      eor     x17,x10,x21,ror#39      // Sigma0(a)
++      eor     x8,x8,x1,lsr#6  // sigma1(X[i+14])
++      add     x3,x3,x12
++      add     x24,x24,x20                     // d+=h
++      add     x20,x20,x19                     // h+=Maj(a,b,c)
++      ldr     x19,[x30],#8            // *K++, x28 in next round
++      add     x3,x3,x9
++      add     x20,x20,x17                     // h+=Sigma0(a)
++      add     x3,x3,x8
++      cbnz    x19,.Loop_16_xx
++
++      ldp     x0,x2,[x29,#96]
++      ldr     x1,[x29,#112]
++      sub     x30,x30,#648            // rewind
++
++      ldp     x3,x4,[x0]
++      ldp     x5,x6,[x0,#2*8]
++      add     x1,x1,#14*8                     // advance input pointer
++      ldp     x7,x8,[x0,#4*8]
++      add     x20,x20,x3
++      ldp     x9,x10,[x0,#6*8]
++      add     x21,x21,x4
++      add     x22,x22,x5
++      add     x23,x23,x6
++      stp     x20,x21,[x0]
++      add     x24,x24,x7
++      add     x25,x25,x8
++      stp     x22,x23,[x0,#2*8]
++      add     x26,x26,x9
++      add     x27,x27,x10
++      cmp     x1,x2
++      stp     x24,x25,[x0,#4*8]
++      stp     x26,x27,[x0,#6*8]
++      b.ne    .Loop
++
++      ldp     x19,x20,[x29,#16]
++      add     sp,sp,#4*8
++      ldp     x21,x22,[x29,#32]
++      ldp     x23,x24,[x29,#48]
++      ldp     x25,x26,[x29,#64]
++      ldp     x27,x28,[x29,#80]
++      ldp     x29,x30,[sp],#128
++      ret
++.size sha512_block_data_order,.-sha512_block_data_order
++
++.align        6
++.type .LK512,%object
++.LK512:
++      .quad   0x428a2f98d728ae22,0x7137449123ef65cd
++      .quad   0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
++      .quad   0x3956c25bf348b538,0x59f111f1b605d019
++      .quad   0x923f82a4af194f9b,0xab1c5ed5da6d8118
++      .quad   0xd807aa98a3030242,0x12835b0145706fbe
++      .quad   0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
++      .quad   0x72be5d74f27b896f,0x80deb1fe3b1696b1
++      .quad   0x9bdc06a725c71235,0xc19bf174cf692694
++      .quad   0xe49b69c19ef14ad2,0xefbe4786384f25e3
++      .quad   0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
++      .quad   0x2de92c6f592b0275,0x4a7484aa6ea6e483
++      .quad   0x5cb0a9dcbd41fbd4,0x76f988da831153b5
++      .quad   0x983e5152ee66dfab,0xa831c66d2db43210
++      .quad   0xb00327c898fb213f,0xbf597fc7beef0ee4
++      .quad   0xc6e00bf33da88fc2,0xd5a79147930aa725
++      .quad   0x06ca6351e003826f,0x142929670a0e6e70
++      .quad   0x27b70a8546d22ffc,0x2e1b21385c26c926
++      .quad   0x4d2c6dfc5ac42aed,0x53380d139d95b3df
++      .quad   0x650a73548baf63de,0x766a0abb3c77b2a8
++      .quad   0x81c2c92e47edaee6,0x92722c851482353b
++      .quad   0xa2bfe8a14cf10364,0xa81a664bbc423001
++      .quad   0xc24b8b70d0f89791,0xc76c51a30654be30
++      .quad   0xd192e819d6ef5218,0xd69906245565a910
++      .quad   0xf40e35855771202a,0x106aa07032bbd1b8
++      .quad   0x19a4c116b8d2d0c8,0x1e376c085141ab53
++      .quad   0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
++      .quad   0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
++      .quad   0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
++      .quad   0x748f82ee5defb2fc,0x78a5636f43172f60
++      .quad   0x84c87814a1f0ab72,0x8cc702081a6439ec
++      .quad   0x90befffa23631e28,0xa4506cebde82bde9
++      .quad   0xbef9a3f7b2c67915,0xc67178f2e372532b
++      .quad   0xca273eceea26619c,0xd186b8c721c0c207
++      .quad   0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
++      .quad   0x06f067aa72176fba,0x0a637dc5a2c898a6
++      .quad   0x113f9804bef90dae,0x1b710b35131c471b
++      .quad   0x28db77f523047d84,0x32caab7b40c72493
++      .quad   0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
++      .quad   0x4cc5d4becb3e42b6,0x597f299cfc657e2a
++      .quad   0x5fcb6fab3ad6faec,0x6c44198c4a475817
++      .quad   0       // terminator
++.size .LK512,.-.LK512
++#ifndef       __KERNEL__
++.align        3
++.LOPENSSL_armcap_P:
++# ifdef       __ILP32__
++      .long   OPENSSL_armcap_P-.
++# else
++      .quad   OPENSSL_armcap_P-.
++# endif
++#endif
++.asciz        "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
++.align        2
++#ifndef       __KERNEL__
++.comm OPENSSL_armcap_P,4,4
++#endif
+--- a/arch/arm64/kernel/bpi.S
++++ b/arch/arm64/kernel/bpi.S
+@@ -17,6 +17,7 @@
+  */
+ #include <linux/linkage.h>
++#include <linux/arm-smccc.h>
+ .macro ventry target
+       .rept 31
+@@ -77,3 +78,22 @@ ENTRY(__psci_hyp_bp_inval_start)
+       ldp     x0, x1, [sp, #(16 * 8)]
+       add     sp, sp, #(8 * 18)
+ ENTRY(__psci_hyp_bp_inval_end)
++
++.macro smccc_workaround_1 inst
++      sub     sp, sp, #(8 * 4)
++      stp     x2, x3, [sp, #(8 * 0)]
++      stp     x0, x1, [sp, #(8 * 2)]
++      mov     w0, #ARM_SMCCC_ARCH_WORKAROUND_1
++      \inst   #0
++      ldp     x2, x3, [sp, #(8 * 0)]
++      ldp     x0, x1, [sp, #(8 * 2)]
++      add     sp, sp, #(8 * 4)
++.endm
++
++ENTRY(__smccc_workaround_1_smc_start)
++      smccc_workaround_1      smc
++ENTRY(__smccc_workaround_1_smc_end)
++
++ENTRY(__smccc_workaround_1_hvc_start)
++      smccc_workaround_1      hvc
++ENTRY(__smccc_workaround_1_hvc_end)
+--- a/arch/arm64/kernel/cpu_errata.c
++++ b/arch/arm64/kernel/cpu_errata.c
+@@ -54,6 +54,10 @@ DEFINE_PER_CPU_READ_MOSTLY(struct bp_har
+ #ifdef CONFIG_KVM
+ extern char __psci_hyp_bp_inval_start[], __psci_hyp_bp_inval_end[];
++extern char __smccc_workaround_1_smc_start[];
++extern char __smccc_workaround_1_smc_end[];
++extern char __smccc_workaround_1_hvc_start[];
++extern char __smccc_workaround_1_hvc_end[];
+ static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
+                               const char *hyp_vecs_end)
+@@ -96,8 +100,12 @@ static void __install_bp_hardening_cb(bp
+       spin_unlock(&bp_lock);
+ }
+ #else
+-#define __psci_hyp_bp_inval_start     NULL
+-#define __psci_hyp_bp_inval_end               NULL
++#define __psci_hyp_bp_inval_start             NULL
++#define __psci_hyp_bp_inval_end                       NULL
++#define __smccc_workaround_1_smc_start                NULL
++#define __smccc_workaround_1_smc_end          NULL
++#define __smccc_workaround_1_hvc_start                NULL
++#define __smccc_workaround_1_hvc_end          NULL
+ static void __install_bp_hardening_cb(bp_hardening_cb_t fn,
+                                     const char *hyp_vecs_start,
+@@ -124,17 +132,75 @@ static void  install_bp_hardening_cb(con
+       __install_bp_hardening_cb(fn, hyp_vecs_start, hyp_vecs_end);
+ }
++#include <uapi/linux/psci.h>
++#include <linux/arm-smccc.h>
+ #include <linux/psci.h>
++static void call_smc_arch_workaround_1(void)
++{
++      arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
++}
++
++static void call_hvc_arch_workaround_1(void)
++{
++      arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
++}
++
++static bool check_smccc_arch_workaround_1(const struct arm64_cpu_capabilities *entry)
++{
++      bp_hardening_cb_t cb;
++      void *smccc_start, *smccc_end;
++      struct arm_smccc_res res;
++
++      if (!entry->matches(entry, SCOPE_LOCAL_CPU))
++              return false;
++
++      if (psci_ops.smccc_version == SMCCC_VERSION_1_0)
++              return false;
++
++      switch (psci_ops.conduit) {
++      case PSCI_CONDUIT_HVC:
++              arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
++                                ARM_SMCCC_ARCH_WORKAROUND_1, &res);
++              if (res.a0)
++                      return false;
++              cb = call_hvc_arch_workaround_1;
++              smccc_start = __smccc_workaround_1_hvc_start;
++              smccc_end = __smccc_workaround_1_hvc_end;
++              break;
++
++      case PSCI_CONDUIT_SMC:
++              arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
++                                ARM_SMCCC_ARCH_WORKAROUND_1, &res);
++              if (res.a0)
++                      return false;
++              cb = call_smc_arch_workaround_1;
++              smccc_start = __smccc_workaround_1_smc_start;
++              smccc_end = __smccc_workaround_1_smc_end;
++              break;
++
++      default:
++              return false;
++      }
++
++      install_bp_hardening_cb(entry, cb, smccc_start, smccc_end);
++
++      return true;
++}
++
+ static int enable_psci_bp_hardening(void *data)
+ {
+       const struct arm64_cpu_capabilities *entry = data;
+-      if (psci_ops.get_version)
++      if (psci_ops.get_version) {
++              if (check_smccc_arch_workaround_1(entry))
++                      return 0;
++
+               install_bp_hardening_cb(entry,
+                                      (bp_hardening_cb_t)psci_ops.get_version,
+                                      __psci_hyp_bp_inval_start,
+                                      __psci_hyp_bp_inval_end);
++      }
+       return 0;
+ }
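
Editor's note: as a rough userspace model of the selection logic this patch adds, the sketch below probes a (stubbed) firmware for ARM_SMCCC_ARCH_WORKAROUND_1 and picks a callback per conduit. Only the two SMCCC function IDs and the decision flow come from the patch; the stub function, strings and main() are illustration-only assumptions, not kernel code.

#include <stdio.h>

/* Function IDs as used by the patch (values from include/linux/arm-smccc.h) */
#define ARM_SMCCC_ARCH_FEATURES_FUNC_ID  0x80000001U
#define ARM_SMCCC_ARCH_WORKAROUND_1      0x80008000U

enum conduit { CONDUIT_NONE, CONDUIT_SMC, CONDUIT_HVC };

/* Stand-in for the firmware ARCH_FEATURES query; a real system issues an
 * SMC or HVC here.  Returns 0 when the queried workaround is implemented. */
static long stub_arch_features(unsigned int fn_id)
{
	return (fn_id == ARM_SMCCC_ARCH_WORKAROUND_1) ? 0 : -1;
}

static const char *select_bp_hardening_cb(enum conduit c, int have_smccc_v1_1)
{
	if (!have_smccc_v1_1 || c == CONDUIT_NONE)
		return "fall back to psci_ops.get_version callback";
	if (stub_arch_features(ARM_SMCCC_ARCH_WORKAROUND_1) != 0)
		return "fall back to psci_ops.get_version callback";
	return (c == CONDUIT_HVC) ? "install call_hvc_arch_workaround_1"
				  : "install call_smc_arch_workaround_1";
}

int main(void)
{
	printf("%s\n", select_bp_hardening_cb(CONDUIT_SMC, 1));
	return 0;
}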
diff --git a/queue-4.9/arm64-add-skeleton-to-harden-the-branch-predictor-against-aliasing-attacks.patch b/queue-4.9/arm64-add-skeleton-to-harden-the-branch-predictor-against-aliasing-attacks.patch
new file mode 100644 (file)
index 0000000..f090d34
--- /dev/null
@@ -0,0 +1,373 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:13 +0100
+Subject: [PATCH v4.9.y 17/42] arm64: Add skeleton to harden the branch predictor against aliasing attacks
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-18-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit 0f15adbb2861ce6f75ccfc5a92b19eae0ef327d0 upstream.
+
+Aliasing attacks against CPU branch predictors can allow an attacker to
+redirect speculative control flow on some CPUs and potentially divulge
+information from one context to another.
+
+This patch adds initial skeleton code behind a new Kconfig option to
+enable implementation-specific mitigations against these attacks for
+CPUs that are affected.
+
+Co-developed-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+[v4.9: copy bp hardening cb via text mapping]
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/Kconfig               |   17 ++++++++
+ arch/arm64/include/asm/cpucaps.h |    3 +
+ arch/arm64/include/asm/mmu.h     |   39 ++++++++++++++++++++
+ arch/arm64/include/asm/sysreg.h  |    2 +
+ arch/arm64/kernel/Makefile       |    4 ++
+ arch/arm64/kernel/bpi.S          |   55 ++++++++++++++++++++++++++++
+ arch/arm64/kernel/cpu_errata.c   |   74 +++++++++++++++++++++++++++++++++++++++
+ arch/arm64/kernel/cpufeature.c   |    3 +
+ arch/arm64/kernel/entry.S        |    8 ++--
+ arch/arm64/mm/context.c          |    2 +
+ arch/arm64/mm/fault.c            |   17 ++++++++
+ 11 files changed, 219 insertions(+), 5 deletions(-)
+ create mode 100644 arch/arm64/kernel/bpi.S
+
+--- a/arch/arm64/Kconfig
++++ b/arch/arm64/Kconfig
+@@ -745,6 +745,23 @@ config UNMAP_KERNEL_AT_EL0
+         If unsure, say Y.
++config HARDEN_BRANCH_PREDICTOR
++      bool "Harden the branch predictor against aliasing attacks" if EXPERT
++      default y
++      help
++        Speculation attacks against some high-performance processors rely on
++        being able to manipulate the branch predictor for a victim context by
++        executing aliasing branches in the attacker context.  Such attacks
++        can be partially mitigated against by clearing internal branch
++        predictor state and limiting the prediction logic in some situations.
++
++        This config option will take CPU-specific actions to harden the
++        branch predictor against aliasing attacks and may rely on specific
++        instruction sequences or control bits being set by the system
++        firmware.
++
++        If unsure, say Y.
++
+ menuconfig ARMV8_DEPRECATED
+       bool "Emulate deprecated/obsolete ARMv8 instructions"
+       depends on COMPAT
+--- a/arch/arm64/include/asm/cpucaps.h
++++ b/arch/arm64/include/asm/cpucaps.h
+@@ -35,7 +35,8 @@
+ #define ARM64_HYP_OFFSET_LOW                  14
+ #define ARM64_MISMATCHED_CACHE_LINE_SIZE      15
+ #define ARM64_UNMAP_KERNEL_AT_EL0             16
++#define ARM64_HARDEN_BRANCH_PREDICTOR         17
+-#define ARM64_NCAPS                           17
++#define ARM64_NCAPS                           18
+ #endif /* __ASM_CPUCAPS_H */
+--- a/arch/arm64/include/asm/mmu.h
++++ b/arch/arm64/include/asm/mmu.h
+@@ -20,6 +20,8 @@
+ #ifndef __ASSEMBLY__
++#include <linux/percpu.h>
++
+ typedef struct {
+       atomic64_t      id;
+       void            *vdso;
+@@ -38,6 +40,43 @@ static inline bool arm64_kernel_unmapped
+              cpus_have_cap(ARM64_UNMAP_KERNEL_AT_EL0);
+ }
++typedef void (*bp_hardening_cb_t)(void);
++
++struct bp_hardening_data {
++      int                     hyp_vectors_slot;
++      bp_hardening_cb_t       fn;
++};
++
++#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
++extern char __bp_harden_hyp_vecs_start[], __bp_harden_hyp_vecs_end[];
++
++DECLARE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
++
++static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void)
++{
++      return this_cpu_ptr(&bp_hardening_data);
++}
++
++static inline void arm64_apply_bp_hardening(void)
++{
++      struct bp_hardening_data *d;
++
++      if (!cpus_have_cap(ARM64_HARDEN_BRANCH_PREDICTOR))
++              return;
++
++      d = arm64_get_bp_hardening_data();
++      if (d->fn)
++              d->fn();
++}
++#else
++static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void)
++{
++      return NULL;
++}
++
++static inline void arm64_apply_bp_hardening(void)     { }
++#endif        /* CONFIG_HARDEN_BRANCH_PREDICTOR */
++
+ extern void paging_init(void);
+ extern void bootmem_init(void);
+ extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt);
+--- a/arch/arm64/include/asm/sysreg.h
++++ b/arch/arm64/include/asm/sysreg.h
+@@ -118,6 +118,8 @@
+ /* id_aa64pfr0 */
+ #define ID_AA64PFR0_CSV3_SHIFT                60
++#define ID_AA64PFR0_CSV2_SHIFT                56
++#define ID_AA64PFR0_SVE_SHIFT         32
+ #define ID_AA64PFR0_GIC_SHIFT         24
+ #define ID_AA64PFR0_ASIMD_SHIFT               20
+ #define ID_AA64PFR0_FP_SHIFT          16
+--- a/arch/arm64/kernel/Makefile
++++ b/arch/arm64/kernel/Makefile
+@@ -51,6 +51,10 @@ arm64-obj-$(CONFIG_HIBERNATION)             += hibe
+ arm64-obj-$(CONFIG_KEXEC)             += machine_kexec.o relocate_kernel.o    \
+                                          cpu-reset.o
++ifeq ($(CONFIG_KVM),y)
++arm64-obj-$(CONFIG_HARDEN_BRANCH_PREDICTOR)   += bpi.o
++endif
++
+ obj-y                                 += $(arm64-obj-y) vdso/ probes/
+ obj-m                                 += $(arm64-obj-m)
+ head-y                                        := head.o
+--- /dev/null
++++ b/arch/arm64/kernel/bpi.S
+@@ -0,0 +1,55 @@
++/*
++ * Contains CPU specific branch predictor invalidation sequences
++ *
++ * Copyright (C) 2018 ARM Ltd.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
++ */
++
++#include <linux/linkage.h>
++
++.macro ventry target
++      .rept 31
++      nop
++      .endr
++      b       \target
++.endm
++
++.macro vectors target
++      ventry \target + 0x000
++      ventry \target + 0x080
++      ventry \target + 0x100
++      ventry \target + 0x180
++
++      ventry \target + 0x200
++      ventry \target + 0x280
++      ventry \target + 0x300
++      ventry \target + 0x380
++
++      ventry \target + 0x400
++      ventry \target + 0x480
++      ventry \target + 0x500
++      ventry \target + 0x580
++
++      ventry \target + 0x600
++      ventry \target + 0x680
++      ventry \target + 0x700
++      ventry \target + 0x780
++.endm
++
++      .align  11
++ENTRY(__bp_harden_hyp_vecs_start)
++      .rept 4
++      vectors __kvm_hyp_vector
++      .endr
++ENTRY(__bp_harden_hyp_vecs_end)
+--- a/arch/arm64/kernel/cpu_errata.c
++++ b/arch/arm64/kernel/cpu_errata.c
+@@ -46,6 +46,80 @@ static int cpu_enable_trap_ctr_access(vo
+       return 0;
+ }
++#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
++#include <asm/mmu_context.h>
++#include <asm/cacheflush.h>
++
++DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
++
++#ifdef CONFIG_KVM
++static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
++                              const char *hyp_vecs_end)
++{
++      void *dst = __bp_harden_hyp_vecs_start + slot * SZ_2K;
++      int i;
++
++      for (i = 0; i < SZ_2K; i += 0x80)
++              memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start);
++
++      flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K);
++}
++
++static void __install_bp_hardening_cb(bp_hardening_cb_t fn,
++                                    const char *hyp_vecs_start,
++                                    const char *hyp_vecs_end)
++{
++      static int last_slot = -1;
++      static DEFINE_SPINLOCK(bp_lock);
++      int cpu, slot = -1;
++
++      spin_lock(&bp_lock);
++      for_each_possible_cpu(cpu) {
++              if (per_cpu(bp_hardening_data.fn, cpu) == fn) {
++                      slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu);
++                      break;
++              }
++      }
++
++      if (slot == -1) {
++              last_slot++;
++              BUG_ON(((__bp_harden_hyp_vecs_end - __bp_harden_hyp_vecs_start)
++                      / SZ_2K) <= last_slot);
++              slot = last_slot;
++              __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end);
++      }
++
++      __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot);
++      __this_cpu_write(bp_hardening_data.fn, fn);
++      spin_unlock(&bp_lock);
++}
++#else
++static void __install_bp_hardening_cb(bp_hardening_cb_t fn,
++                                    const char *hyp_vecs_start,
++                                    const char *hyp_vecs_end)
++{
++      __this_cpu_write(bp_hardening_data.fn, fn);
++}
++#endif        /* CONFIG_KVM */
++
++static void  install_bp_hardening_cb(const struct arm64_cpu_capabilities *entry,
++                                   bp_hardening_cb_t fn,
++                                   const char *hyp_vecs_start,
++                                   const char *hyp_vecs_end)
++{
++      u64 pfr0;
++
++      if (!entry->matches(entry, SCOPE_LOCAL_CPU))
++              return;
++
++      pfr0 = read_cpuid(ID_AA64PFR0_EL1);
++      if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_CSV2_SHIFT))
++              return;
++
++      __install_bp_hardening_cb(fn, hyp_vecs_start, hyp_vecs_end);
++}
++#endif        /* CONFIG_HARDEN_BRANCH_PREDICTOR */
++
+ #define MIDR_RANGE(model, min, max) \
+       .def_scope = SCOPE_LOCAL_CPU, \
+       .matches = is_affected_midr_range, \
+--- a/arch/arm64/kernel/cpufeature.c
++++ b/arch/arm64/kernel/cpufeature.c
+@@ -94,7 +94,8 @@ static const struct arm64_ftr_bits ftr_i
+ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
+       ARM64_FTR_BITS(FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV3_SHIFT, 4, 0),
+-      ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 28, 0),
++      ARM64_FTR_BITS(FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV2_SHIFT, 4, 0),
++      ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 32, 24, 0),
+       ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, 28, 4, 0),
+       ARM64_FTR_BITS(FTR_STRICT, FTR_EXACT, ID_AA64PFR0_GIC_SHIFT, 4, 0),
+       S_ARM64_FTR_BITS(FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI),
+--- a/arch/arm64/kernel/entry.S
++++ b/arch/arm64/kernel/entry.S
+@@ -589,13 +589,15 @@ el0_ia:
+        * Instruction abort handling
+        */
+       mrs     x26, far_el1
+-      // enable interrupts before calling the main handler
+-      enable_dbg_and_irq
++      msr     daifclr, #(8 | 4 | 1)
++#ifdef CONFIG_TRACE_IRQFLAGS
++      bl      trace_hardirqs_off
++#endif
+       ct_user_exit
+       mov     x0, x26
+       mov     x1, x25
+       mov     x2, sp
+-      bl      do_mem_abort
++      bl      do_el0_ia_bp_hardening
+       b       ret_to_user
+ el0_fpsimd_acc:
+       /*
+--- a/arch/arm64/mm/context.c
++++ b/arch/arm64/mm/context.c
+@@ -240,6 +240,8 @@ asmlinkage void post_ttbr_update_workaro
+                       "ic iallu; dsb nsh; isb",
+                       ARM64_WORKAROUND_CAVIUM_27456,
+                       CONFIG_CAVIUM_ERRATUM_27456));
++
++      arm64_apply_bp_hardening();
+ }
+ static int asids_init(void)
+--- a/arch/arm64/mm/fault.c
++++ b/arch/arm64/mm/fault.c
+@@ -590,6 +590,23 @@ asmlinkage void __exception do_mem_abort
+       arm64_notify_die("", regs, &info, esr);
+ }
++asmlinkage void __exception do_el0_ia_bp_hardening(unsigned long addr,
++                                                 unsigned int esr,
++                                                 struct pt_regs *regs)
++{
++      /*
++       * We've taken an instruction abort from userspace and not yet
++       * re-enabled IRQs. If the address is a kernel address, apply
++       * BP hardening prior to enabling IRQs and pre-emption.
++       */
++      if (addr > TASK_SIZE)
++              arm64_apply_bp_hardening();
++
++      local_irq_enable();
++      do_mem_abort(addr, esr, regs);
++}
++
++
+ /*
+  * Handle stack alignment exceptions.
+  */
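
Editor's note: the skeleton above boils down to a per-CPU slot holding a function pointer, an install path that writes it (and, under KVM, copies a hardened vector page), and arm64_apply_bp_hardening() invoking it on the relevant exception paths. The minimal userspace model below shows only that install/invoke shape; the array, the fake this_cpu and the example callback are stand-ins, not kernel API.

#include <stdio.h>

#define NR_CPUS 4

typedef void (*bp_hardening_cb_t)(void);

struct bp_hardening_data {
	int hyp_vectors_slot;
	bp_hardening_cb_t fn;
};

static struct bp_hardening_data bp_data[NR_CPUS];  /* models the per-CPU variable */
static int this_cpu;                               /* models smp_processor_id()   */

static void example_cb(void) { puts("invalidate branch predictor"); }

static void install_cb(int cpu, bp_hardening_cb_t fn)
{
	bp_data[cpu].fn = fn;       /* models __this_cpu_write(bp_hardening_data.fn, fn) */
}

static void apply_bp_hardening(void)
{
	if (bp_data[this_cpu].fn)   /* only if a callback was installed for this CPU */
		bp_data[this_cpu].fn();
}

int main(void)
{
	install_cb(this_cpu, example_cb);
	apply_bp_hardening();       /* called from the exception-entry paths */
	return 0;
}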
diff --git a/queue-4.9/arm64-barrier-add-csdb-macros-to-control-data-value-prediction.patch b/queue-4.9/arm64-barrier-add-csdb-macros-to-control-data-value-prediction.patch
new file mode 100644 (file)
index 0000000..3ffc0a2
--- /dev/null
@@ -0,0 +1,57 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:10:57 +0100
+Subject: [PATCH v4.9.y 01/42] arm64: barrier: Add CSDB macros to control data-value prediction
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-2-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit 669474e772b952b14f4de4845a1558fd4c0414a4 upstream.
+
+For CPUs capable of data value prediction, CSDB waits for any outstanding
+predictions to architecturally resolve before allowing speculative execution
+to continue. Provide macros to expose it to the arch code.
+
+Reviewed-by: Mark Rutland <mark.rutland@arm.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/assembler.h |    7 +++++++
+ arch/arm64/include/asm/barrier.h   |    2 ++
+ 2 files changed, 9 insertions(+)
+
+--- a/arch/arm64/include/asm/assembler.h
++++ b/arch/arm64/include/asm/assembler.h
+@@ -87,6 +87,13 @@
+       .endm
+ /*
++ * Value prediction barrier
++ */
++      .macro  csdb
++      hint    #20
++      .endm
++
++/*
+  * NOP sequence
+  */
+       .macro  nops, num
+--- a/arch/arm64/include/asm/barrier.h
++++ b/arch/arm64/include/asm/barrier.h
+@@ -31,6 +31,8 @@
+ #define dmb(opt)      asm volatile("dmb " #opt : : : "memory")
+ #define dsb(opt)      asm volatile("dsb " #opt : : : "memory")
++#define csdb()                asm volatile("hint #20" : : : "memory")
++
+ #define mb()          dsb(sy)
+ #define rmb()         dsb(ld)
+ #define wmb()         dsb(st)
diff --git a/queue-4.9/arm64-branch-predictor-hardening-for-cavium-thunderx2.patch b/queue-4.9/arm64-branch-predictor-hardening-for-cavium-thunderx2.patch
new file mode 100644 (file)
index 0000000..2d0d590
--- /dev/null
@@ -0,0 +1,49 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:22 +0100
+Subject: [PATCH v4.9.y 26/42] arm64: Branch predictor hardening for Cavium ThunderX2
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-27-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Jayachandran C <jnair@caviumnetworks.com>
+
+commit f3d795d9b360523beca6d13ba64c2c532f601149 upstream.
+
+Use PSCI based mitigation for speculative execution attacks targeting
+the branch predictor. We use the same mechanism as the one used for
+Cortex-A CPUs; we expect the PSCI version call to have a side effect
+of clearing the BTBs.
+
+Acked-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Jayachandran C <jnair@caviumnetworks.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kernel/cpu_errata.c |   10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/arch/arm64/kernel/cpu_errata.c
++++ b/arch/arm64/kernel/cpu_errata.c
+@@ -252,6 +252,16 @@ const struct arm64_cpu_capabilities arm6
+               MIDR_ALL_VERSIONS(MIDR_CORTEX_A75),
+               .enable = enable_psci_bp_hardening,
+       },
++      {
++              .capability = ARM64_HARDEN_BRANCH_PREDICTOR,
++              MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN),
++              .enable = enable_psci_bp_hardening,
++      },
++      {
++              .capability = ARM64_HARDEN_BRANCH_PREDICTOR,
++              MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2),
++              .enable = enable_psci_bp_hardening,
++      },
+ #endif
+       {
+       }
diff --git a/queue-4.9/arm64-cpu_errata-allow-an-erratum-to-be-match-for-all-revisions-of-a-core.patch b/queue-4.9/arm64-cpu_errata-allow-an-erratum-to-be-match-for-all-revisions-of-a-core.patch
new file mode 100644 (file)
index 0000000..cb77822
--- /dev/null
@@ -0,0 +1,50 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:20 +0100
+Subject: [PATCH v4.9.y 24/42] arm64: cpu_errata: Allow an erratum to be match for all revisions of a core
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-25-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit 06f1494f837da8997d670a1ba87add7963b08922 upstream.
+
+Some minor erratum may not be fixed in further revisions of a core,
+leading to a situation where the workaround needs to be updated each
+time an updated core is released.
+
+Introduce a MIDR_ALL_VERSIONS match helper that will work for all
+versions of that MIDR, once and for all.
+
+Acked-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Mark Rutland <mark.rutland@arm.com>
+Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
+Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kernel/cpu_errata.c |    7 +++++++
+ 1 file changed, 7 insertions(+)
+
+--- a/arch/arm64/kernel/cpu_errata.c
++++ b/arch/arm64/kernel/cpu_errata.c
+@@ -127,6 +127,13 @@ static void  install_bp_hardening_cb(con
+       .midr_range_min = min, \
+       .midr_range_max = max
++#define MIDR_ALL_VERSIONS(model) \
++      .def_scope = SCOPE_LOCAL_CPU, \
++      .matches = is_affected_midr_range, \
++      .midr_model = model, \
++      .midr_range_min = 0, \
++      .midr_range_max = (MIDR_VARIANT_MASK | MIDR_REVISION_MASK)
++
+ const struct arm64_cpu_capabilities arm64_errata[] = {
+ #if   defined(CONFIG_ARM64_ERRATUM_826319) || \
+       defined(CONFIG_ARM64_ERRATUM_827319) || \
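
Editor's note: a usage sketch for the new helper. With MIDR_ALL_VERSIONS an errata table entry no longer needs explicit min/max revisions. The entry below is a hypothetical fragment (not buildable on its own) but mirrors the branch-predictor entries added later in this series.

/* Fragment: matches every variant/revision of Cortex-A57 */
{
	.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
	MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
	.enable = enable_psci_bp_hardening,
},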
diff --git a/queue-4.9/arm64-cpufeature-__this_cpu_has_cap-shouldn-t-stop-early.patch b/queue-4.9/arm64-cpufeature-__this_cpu_has_cap-shouldn-t-stop-early.patch
new file mode 100644 (file)
index 0000000..62414fa
--- /dev/null
@@ -0,0 +1,50 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:07 +0100
+Subject: [PATCH v4.9.y 11/42] arm64: cpufeature: __this_cpu_has_cap() shouldn't stop early
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-12-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: James Morse <james.morse@arm.com>
+
+commit edf298cfce47ab7279d03b5203ae2ef3a58e49db upstream.
+
+this_cpu_has_cap() tests caps->desc not caps->matches, so it stops
+walking the list when it finds a 'silent' feature, instead of
+walking to the end of the list.
+
+Prior to v4.6's 644c2ae198412 ("arm64: cpufeature: Test 'matches' pointer
+to find the end of the list") we always tested desc to find the end of
+a capability list. This was changed for dubious things like PAN_NOT_UAO.
+v4.7's e3661b128e53e ("arm64: Allow a capability to be checked on
+single CPU") added this_cpu_has_cap() using the old desc style test.
+
+CC: Suzuki K Poulose <suzuki.poulose@arm.com>
+Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
+Acked-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: James Morse <james.morse@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kernel/cpufeature.c |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/arch/arm64/kernel/cpufeature.c
++++ b/arch/arm64/kernel/cpufeature.c
+@@ -1024,9 +1024,8 @@ static bool __this_cpu_has_cap(const str
+       if (WARN_ON(preemptible()))
+               return false;
+-      for (caps = cap_array; caps->desc; caps++)
++      for (caps = cap_array; caps->matches; caps++)
+               if (caps->capability == cap &&
+-                  caps->matches &&
+                   caps->matches(caps, SCOPE_LOCAL_CPU))
+                       return true;
+       return false;
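
Editor's note: to see why terminating on ->desc is wrong, consider a capability array containing a 'silent' entry, i.e. one with .matches set but no .desc. The runnable userspace model below (structure and names simplified from cpufeature.c, illustration only) contrasts the old and new loop termination.

#include <stdbool.h>
#include <stdio.h>

struct cap {
	const char *desc;                /* NULL for 'silent' capabilities */
	int capability;
	bool (*matches)(void);
};

static bool yes(void) { return true; }

static const struct cap caps[] = {
	{ "cap A", 1, yes },
	{ NULL,    2, yes },             /* silent entry: no description      */
	{ "cap C", 3, yes },
	{ NULL,    0, NULL },            /* real terminator: matches == NULL  */
};

static bool has_cap_old(int cap)     /* old loop: stops at the silent entry */
{
	for (const struct cap *c = caps; c->desc; c++)
		if (c->capability == cap && c->matches && c->matches())
			return true;
	return false;
}

static bool has_cap_new(int cap)     /* fixed loop: walks to the terminator */
{
	for (const struct cap *c = caps; c->matches; c++)
		if (c->capability == cap && c->matches())
			return true;
	return false;
}

int main(void)
{
	printf("old: cap C found? %d\n", has_cap_old(3));  /* 0: missed */
	printf("new: cap C found? %d\n", has_cap_new(3));  /* 1: found  */
	return 0;
}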
diff --git a/queue-4.9/arm64-cpufeature-pass-capability-structure-to-enable-callback.patch b/queue-4.9/arm64-cpufeature-pass-capability-structure-to-enable-callback.patch
new file mode 100644 (file)
index 0000000..9f370e9
--- /dev/null
@@ -0,0 +1,52 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:09 +0100
+Subject: [PATCH v4.9.y 13/42] arm64: cpufeature: Pass capability structure to ->enable callback
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-14-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit 0a0d111d40fd1dc588cc590fab6b55d86ddc71d3 upstream.
+
+In order to invoke the CPU capability ->matches callback from the ->enable
+callback for applying local-CPU workarounds, we need a handle on the
+capability structure.
+
+This patch passes a pointer to the capability structure to the ->enable
+callback.
+
+Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kernel/cpufeature.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/arch/arm64/kernel/cpufeature.c
++++ b/arch/arm64/kernel/cpufeature.c
+@@ -1058,7 +1058,7 @@ void __init enable_cpu_capabilities(cons
+                        * uses an IPI, giving us a PSTATE that disappears when
+                        * we return.
+                        */
+-                      stop_machine(caps->enable, NULL, cpu_online_mask);
++                      stop_machine(caps->enable, (void *)caps, cpu_online_mask);
+ }
+ /*
+@@ -1115,7 +1115,7 @@ verify_local_cpu_features(const struct a
+                       cpu_die_early();
+               }
+               if (caps->enable)
+-                      caps->enable(NULL);
++                      caps->enable((void *)caps);
+       }
+ }
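
Editor's note: with this change an ->enable callback can recover its capability entry from the opaque argument; enable_psci_bp_hardening later in this series does exactly that. A minimal sketch of the callback shape (hypothetical name, fragment only, not buildable on its own):

/* Fragment: the callback now receives the capability entry it belongs to. */
static int example_enable_cb(void *data)
{
	const struct arm64_cpu_capabilities *entry = data;

	if (!entry->matches(entry, SCOPE_LOCAL_CPU))
		return 0;           /* this CPU is not affected */

	/* ...apply the local-CPU workaround here... */
	return 0;
}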
diff --git a/queue-4.9/arm64-cputype-add-missing-midr-values-for-cortex-a72-and-cortex-a75.patch b/queue-4.9/arm64-cputype-add-missing-midr-values-for-cortex-a72-and-cortex-a75.patch
new file mode 100644 (file)
index 0000000..5550d96
--- /dev/null
@@ -0,0 +1,50 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:19 +0100
+Subject: [PATCH v4.9.y 23/42] arm64: cputype: Add missing MIDR values for Cortex-A72 and Cortex-A75
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-24-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit a65d219fe5dc7887fd5ca04c2ac3e9a34feb8dfc upstream.
+
+Hook up MIDR values for the Cortex-A72 and Cortex-A75 CPUs, since they
+will soon need MIDR matches for hardening the branch predictor.
+
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/cputype.h |    6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/arch/arm64/include/asm/cputype.h
++++ b/arch/arm64/include/asm/cputype.h
+@@ -75,7 +75,10 @@
+ #define ARM_CPU_PART_AEM_V8           0xD0F
+ #define ARM_CPU_PART_FOUNDATION               0xD00
+ #define ARM_CPU_PART_CORTEX_A57               0xD07
++#define ARM_CPU_PART_CORTEX_A72               0xD08
+ #define ARM_CPU_PART_CORTEX_A53               0xD03
++#define ARM_CPU_PART_CORTEX_A73               0xD09
++#define ARM_CPU_PART_CORTEX_A75               0xD0A
+ #define APM_CPU_PART_POTENZA          0x000
+@@ -87,6 +90,9 @@
+ #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)
+ #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57)
++#define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72)
++#define MIDR_CORTEX_A73 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A73)
++#define MIDR_CORTEX_A75 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A75)
+ #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
+ #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
+ #define MIDR_CAVIUM_THUNDERX2 MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX2)
diff --git a/queue-4.9/arm64-entry-apply-bp-hardening-for-high-priority-synchronous-exceptions.patch b/queue-4.9/arm64-entry-apply-bp-hardening-for-high-priority-synchronous-exceptions.patch
new file mode 100644 (file)
index 0000000..0d0d91d
--- /dev/null
@@ -0,0 +1,71 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:17 +0100
+Subject: [PATCH v4.9.y 21/42] arm64: entry: Apply BP hardening for high-priority synchronous exceptions
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-22-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit 5dfc6ed27710c42cbc15db5c0d4475699991da0a upstream.
+
+Software-step and PC alignment fault exceptions have higher priority than
+instruction abort exceptions, so apply the BP hardening hooks there too
+if the user PC appears to reside in kernel space.
+
+Reported-by: Dan Hettena <dhettena@nvidia.com>
+Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kernel/entry.S |    6 ++++--
+ arch/arm64/mm/fault.c     |    9 +++++++++
+ 2 files changed, 13 insertions(+), 2 deletions(-)
+
+--- a/arch/arm64/kernel/entry.S
++++ b/arch/arm64/kernel/entry.S
+@@ -624,8 +624,10 @@ el0_sp_pc:
+        * Stack or PC alignment exception handling
+        */
+       mrs     x26, far_el1
+-      // enable interrupts before calling the main handler
+-      enable_dbg_and_irq
++      enable_dbg
++#ifdef CONFIG_TRACE_IRQFLAGS
++      bl      trace_hardirqs_off
++#endif
+       ct_user_exit
+       mov     x0, x26
+       mov     x1, x25
+--- a/arch/arm64/mm/fault.c
++++ b/arch/arm64/mm/fault.c
+@@ -617,6 +617,12 @@ asmlinkage void __exception do_sp_pc_abo
+       struct siginfo info;
+       struct task_struct *tsk = current;
++      if (user_mode(regs)) {
++              if (instruction_pointer(regs) > TASK_SIZE)
++                      arm64_apply_bp_hardening();
++              local_irq_enable();
++      }
++
+       if (show_unhandled_signals && unhandled_signal(tsk, SIGBUS))
+               pr_info_ratelimited("%s[%d]: %s exception: pc=%p sp=%p\n",
+                                   tsk->comm, task_pid_nr(tsk),
+@@ -676,6 +682,9 @@ asmlinkage int __exception do_debug_exce
+       if (interrupts_enabled(regs))
+               trace_hardirqs_off();
++      if (user_mode(regs) && instruction_pointer(regs) > TASK_SIZE)
++              arm64_apply_bp_hardening();
++
+       if (!inf->fn(addr, esr, regs)) {
+               rv = 1;
+       } else {
diff --git a/queue-4.9/arm64-entry-apply-bp-hardening-for-suspicious-interrupts-from-el0.patch b/queue-4.9/arm64-entry-apply-bp-hardening-for-suspicious-interrupts-from-el0.patch
new file mode 100644 (file)
index 0000000..4e6b65f
--- /dev/null
@@ -0,0 +1,63 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:18 +0100
+Subject: [PATCH v4.9.y 22/42] arm64: entry: Apply BP hardening for suspicious interrupts from EL0
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-23-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit 30d88c0e3ace625a92eead9ca0ad94093a8f59fe upstream.
+
+It is possible to take an IRQ from EL0 following a branch to a kernel
+address in such a way that the IRQ is prioritised over the instruction
+abort. Whilst an attacker would need to get the stars to align here,
+it might be sufficient with enough calibration so perform BP hardening
+in the rare case that we see a kernel address in the ELR when handling
+an IRQ from EL0.
+
+Reported-by: Dan Hettena <dhettena@nvidia.com>
+Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kernel/entry.S |    5 +++++
+ arch/arm64/mm/fault.c     |    6 ++++++
+ 2 files changed, 11 insertions(+)
+
+--- a/arch/arm64/kernel/entry.S
++++ b/arch/arm64/kernel/entry.S
+@@ -686,6 +686,11 @@ el0_irq_naked:
+ #endif
+       ct_user_exit
++#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
++      tbz     x22, #55, 1f
++      bl      do_el0_irq_bp_hardening
++1:
++#endif
+       irq_handler
+ #ifdef CONFIG_TRACE_IRQFLAGS
+--- a/arch/arm64/mm/fault.c
++++ b/arch/arm64/mm/fault.c
+@@ -590,6 +590,12 @@ asmlinkage void __exception do_mem_abort
+       arm64_notify_die("", regs, &info, esr);
+ }
++asmlinkage void __exception do_el0_irq_bp_hardening(void)
++{
++      /* PC has already been checked in entry.S */
++      arm64_apply_bp_hardening();
++}
++
+ asmlinkage void __exception do_el0_ia_bp_hardening(unsigned long addr,
+                                                  unsigned int esr,
+                                                  struct pt_regs *regs)
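
Editor's note: the entry.S hunk above gates the call with "tbz x22, #55, 1f", where x22 holds the saved ELR_EL1. Bit 55 is only set for addresses in the kernel (TTBR1) half of the address space, so user-half return addresses skip the hardening call. Roughly, in C terms (a runnable model for illustration, not kernel code):

#include <stdint.h>
#include <stdio.h>

static int should_harden(uint64_t elr_el1)
{
	return (elr_el1 >> 55) & 1;   /* kernel-half address => suspicious */
}

int main(void)
{
	printf("%d\n", should_harden(0x0000007fffff0000ULL)); /* user PC: 0   */
	printf("%d\n", should_harden(0xffff000008081000ULL)); /* kernel PC: 1 */
	return 0;
}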
diff --git a/queue-4.9/arm64-entry-ensure-branch-through-syscall-table-is-bounded-under-speculation.patch b/queue-4.9/arm64-entry-ensure-branch-through-syscall-table-is-bounded-under-speculation.patch
new file mode 100644 (file)
index 0000000..6a46f75
--- /dev/null
@@ -0,0 +1,63 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:02 +0100
+Subject: [PATCH v4.9.y 06/42] arm64: entry: Ensure branch through syscall table is bounded under speculation
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-7-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit 6314d90e64936c584f300a52ef173603fb2461b5 upstream.
+
+In a similar manner to array_index_mask_nospec, this patch introduces an
+assembly macro (mask_nospec64) which can be used to bound a value under
+speculation. This macro is then used to ensure that the indirect branch
+through the syscall table is bounded under speculation, with out-of-range
+addresses speculating as calls to sys_io_setup (0).
+
+Reviewed-by: Mark Rutland <mark.rutland@arm.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+[v4.9: use existing scno & sc_nr definitions]
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/assembler.h |   11 +++++++++++
+ arch/arm64/kernel/entry.S          |    1 +
+ 2 files changed, 12 insertions(+)
+
+--- a/arch/arm64/include/asm/assembler.h
++++ b/arch/arm64/include/asm/assembler.h
+@@ -94,6 +94,17 @@
+       .endm
+ /*
++ * Sanitise a 64-bit bounded index wrt speculation, returning zero if out
++ * of bounds.
++ */
++      .macro  mask_nospec64, idx, limit, tmp
++      sub     \tmp, \idx, \limit
++      bic     \tmp, \tmp, \idx
++      and     \idx, \idx, \tmp, asr #63
++      csdb
++      .endm
++
++/*
+  * NOP sequence
+  */
+       .macro  nops, num
+--- a/arch/arm64/kernel/entry.S
++++ b/arch/arm64/kernel/entry.S
+@@ -795,6 +795,7 @@ el0_svc_naked:                                     // compat entry point
+       b.ne    __sys_trace
+       cmp     scno, sc_nr                     // check upper syscall limit
+       b.hs    ni_sys
++      mask_nospec64 scno, sc_nr, x19  // enforce bounds for syscall number
+       ldr     x16, [stbl, scno, lsl #3]       // address in the syscall table
+       blr     x16                             // call sys_* routine
+       b       ret_fast_syscall
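
Editor's note: the three instructions in mask_nospec64 compute a mask that is all-ones when the index is a plausible in-range value and zero otherwise, and the trailing csdb stops speculation from running ahead with a mispredicted value; an out-of-range syscall number is therefore forced to 0 (sys_io_setup), as the commit message says. The runnable C model below reproduces only the arithmetic (the real macro operates on registers in entry.S, and the limit value here is arbitrary).

#include <stdint.h>
#include <stdio.h>

/* C model of: sub tmp,idx,limit ; bic tmp,tmp,idx ; and idx,idx,tmp,asr #63 */
static uint64_t mask_nospec64(uint64_t idx, uint64_t limit)
{
	uint64_t tmp = idx - limit;        /* bit 63 set iff idx < limit...           */
	tmp &= ~idx;                       /* ...and idx itself has bit 63 clear      */
	uint64_t mask = (uint64_t)((int64_t)tmp >> 63); /* all-ones or zero           */
	return idx & mask;                 /* in range: unchanged; else forced to 0   */
}

int main(void)
{
	uint64_t limit = 398;  /* stand-in for the syscall-table size */
	printf("%llu\n", (unsigned long long)mask_nospec64(5, limit));                 /* 5 */
	printf("%llu\n", (unsigned long long)mask_nospec64(400, limit));               /* 0 */
	printf("%llu\n", (unsigned long long)mask_nospec64((uint64_t)1 << 63, limit)); /* 0 */
	return 0;
}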
diff --git a/queue-4.9/arm64-factor-out-ttbr0_el1-post-update-workaround-into-a-specific-asm-macro.patch b/queue-4.9/arm64-factor-out-ttbr0_el1-post-update-workaround-into-a-specific-asm-macro.patch
new file mode 100644 (file)
index 0000000..2b35473
--- /dev/null
@@ -0,0 +1,68 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:11 +0100
+Subject: [PATCH v4.9.y 15/42] arm64: Factor out TTBR0_EL1 post-update workaround into a specific asm macro
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-16-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Catalin Marinas <catalin.marinas@arm.com>
+
+commit f33bcf03e6079668da6bf4eec4a7dcf9289131d0 upstream.
+
+This patch takes the errata workaround code out of cpu_do_switch_mm into
+a dedicated post_ttbr0_update_workaround macro which will be reused in a
+subsequent patch.
+
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: James Morse <james.morse@arm.com>
+Cc: Kees Cook <keescook@chromium.org>
+Reviewed-by: Mark Rutland <mark.rutland@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/assembler.h |   14 ++++++++++++++
+ arch/arm64/mm/proc.S               |    6 +-----
+ 2 files changed, 15 insertions(+), 5 deletions(-)
+
+--- a/arch/arm64/include/asm/assembler.h
++++ b/arch/arm64/include/asm/assembler.h
+@@ -434,4 +434,18 @@ alternative_endif
+       .macro  pte_to_phys, phys, pte
+       and     \phys, \pte, #(((1 << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
+       .endm
++
++/*
++ * Errata workaround post TTBR0_EL1 update.
++ */
++      .macro  post_ttbr0_update_workaround
++#ifdef CONFIG_CAVIUM_ERRATUM_27456
++alternative_if ARM64_WORKAROUND_CAVIUM_27456
++      ic      iallu
++      dsb     nsh
++      isb
++alternative_else_nop_endif
++#endif
++      .endm
++
+ #endif        /* __ASM_ASSEMBLER_H */
+--- a/arch/arm64/mm/proc.S
++++ b/arch/arm64/mm/proc.S
+@@ -139,11 +139,7 @@ ENTRY(cpu_do_switch_mm)
+       isb
+       msr     ttbr0_el1, x0                   // now update TTBR0
+       isb
+-alternative_if ARM64_WORKAROUND_CAVIUM_27456
+-      ic      iallu
+-      dsb     nsh
+-      isb
+-alternative_else_nop_endif
++      post_ttbr0_update_workaround
+       ret
+ ENDPROC(cpu_do_switch_mm)
diff --git a/queue-4.9/arm64-implement-array_index_mask_nospec.patch b/queue-4.9/arm64-implement-array_index_mask_nospec.patch
new file mode 100644 (file)
index 0000000..c594224
--- /dev/null
@@ -0,0 +1,64 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:10:58 +0100
+Subject: [PATCH v4.9.y 02/42] arm64: Implement array_index_mask_nospec()
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-3-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Robin Murphy <robin.murphy@arm.com>
+
+commit 022620eed3d0bc4bf2027326f599f5ad71c2ea3f upstream.
+
+Provide an optimised, assembly implementation of array_index_mask_nospec()
+for arm64 so that the compiler is not in a position to transform the code
+in ways which affect its ability to inhibit speculation (e.g. by introducing
+conditional branches).
+
+This is similar to the sequence used by x86, modulo architectural differences
+in the carry/borrow flags.
+
+Reviewed-by: Mark Rutland <mark.rutland@arm.com>
+Signed-off-by: Robin Murphy <robin.murphy@arm.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/barrier.h |   21 +++++++++++++++++++++
+ 1 file changed, 21 insertions(+)
+
+--- a/arch/arm64/include/asm/barrier.h
++++ b/arch/arm64/include/asm/barrier.h
+@@ -40,6 +40,27 @@
+ #define dma_rmb()     dmb(oshld)
+ #define dma_wmb()     dmb(oshst)
++/*
++ * Generate a mask for array_index__nospec() that is ~0UL when 0 <= idx < sz
++ * and 0 otherwise.
++ */
++#define array_index_mask_nospec array_index_mask_nospec
++static inline unsigned long array_index_mask_nospec(unsigned long idx,
++                                                  unsigned long sz)
++{
++      unsigned long mask;
++
++      asm volatile(
++      "       cmp     %1, %2\n"
++      "       sbc     %0, xzr, xzr\n"
++      : "=r" (mask)
++      : "r" (idx), "Ir" (sz)
++      : "cc");
++
++      csdb();
++      return mask;
++}
++
+ #define __smp_mb()    dmb(ish)
+ #define __smp_rmb()   dmb(ishld)
+ #define __smp_wmb()   dmb(ishst)
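
Editor's note: the intended use of the helper above is the generic array_index_nospec() pattern: compute the mask, AND it into the index, and only then perform the dependent load, so an out-of-bounds index speculates as element 0. The sketch below shows the caller side; the portable stand-in has the same contract as the arm64 asm version but is NOT itself speculation-safe (the compiler may emit a branch), it only exists so the example runs anywhere.

#include <stdio.h>

/* Same contract as the asm helper: ~0UL when 0 <= idx < sz, 0 otherwise.
 * The real arm64 version uses cmp/sbc followed by csdb so the compiler
 * cannot turn it into a conditional branch. */
static unsigned long array_index_mask_nospec(unsigned long idx, unsigned long sz)
{
	return idx < sz ? ~0UL : 0UL;
}

static const int table[4] = { 10, 20, 30, 40 };

static int table_read_nospec(unsigned long idx)
{
	idx &= array_index_mask_nospec(idx, 4);  /* clamp the index under speculation */
	return table[idx];                       /* out-of-range reads element 0      */
}

int main(void)
{
	printf("%d\n", table_read_nospec(2));    /* 30                        */
	printf("%d\n", table_read_nospec(9));    /* 10: index forced to 0     */
	return 0;
}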
diff --git a/queue-4.9/arm64-implement-branch-predictor-hardening-for-affected-cortex-a-cpus.patch b/queue-4.9/arm64-implement-branch-predictor-hardening-for-affected-cortex-a-cpus.patch
new file mode 100644 (file)
index 0000000..a9a4a52
--- /dev/null
@@ -0,0 +1,135 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:21 +0100
+Subject: [PATCH v4.9.y 25/42] arm64: Implement branch predictor hardening for affected Cortex-A CPUs
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-26-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit aa6acde65e03186b5add8151e1ffe36c3c62639b upstream.
+
+Cortex-A57, A72, A73 and A75 are susceptible to branch predictor aliasing
+and can theoretically be attacked by malicious code.
+
+This patch implements a PSCI-based mitigation for these CPUs when available.
+The call into firmware will invalidate the branch predictor state, preventing
+any malicious entries from affecting other victim contexts.
+
+Co-developed-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kernel/bpi.S        |   24 +++++++++++++++++++++++
+ arch/arm64/kernel/cpu_errata.c |   42 +++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 66 insertions(+)
+
+--- a/arch/arm64/kernel/bpi.S
++++ b/arch/arm64/kernel/bpi.S
+@@ -53,3 +53,27 @@ ENTRY(__bp_harden_hyp_vecs_start)
+       vectors __kvm_hyp_vector
+       .endr
+ ENTRY(__bp_harden_hyp_vecs_end)
++ENTRY(__psci_hyp_bp_inval_start)
++      sub     sp, sp, #(8 * 18)
++      stp     x16, x17, [sp, #(16 * 0)]
++      stp     x14, x15, [sp, #(16 * 1)]
++      stp     x12, x13, [sp, #(16 * 2)]
++      stp     x10, x11, [sp, #(16 * 3)]
++      stp     x8, x9, [sp, #(16 * 4)]
++      stp     x6, x7, [sp, #(16 * 5)]
++      stp     x4, x5, [sp, #(16 * 6)]
++      stp     x2, x3, [sp, #(16 * 7)]
++      stp     x0, x1, [sp, #(16 * 8)]
++      mov     x0, #0x84000000
++      smc     #0
++      ldp     x16, x17, [sp, #(16 * 0)]
++      ldp     x14, x15, [sp, #(16 * 1)]
++      ldp     x12, x13, [sp, #(16 * 2)]
++      ldp     x10, x11, [sp, #(16 * 3)]
++      ldp     x8, x9, [sp, #(16 * 4)]
++      ldp     x6, x7, [sp, #(16 * 5)]
++      ldp     x4, x5, [sp, #(16 * 6)]
++      ldp     x2, x3, [sp, #(16 * 7)]
++      ldp     x0, x1, [sp, #(16 * 8)]
++      add     sp, sp, #(8 * 18)
++ENTRY(__psci_hyp_bp_inval_end)
+--- a/arch/arm64/kernel/cpu_errata.c
++++ b/arch/arm64/kernel/cpu_errata.c
+@@ -53,6 +53,8 @@ static int cpu_enable_trap_ctr_access(vo
+ DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
+ #ifdef CONFIG_KVM
++extern char __psci_hyp_bp_inval_start[], __psci_hyp_bp_inval_end[];
++
+ static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start,
+                               const char *hyp_vecs_end)
+ {
+@@ -94,6 +96,9 @@ static void __install_bp_hardening_cb(bp
+       spin_unlock(&bp_lock);
+ }
+ #else
++#define __psci_hyp_bp_inval_start     NULL
++#define __psci_hyp_bp_inval_end               NULL
++
+ static void __install_bp_hardening_cb(bp_hardening_cb_t fn,
+                                     const char *hyp_vecs_start,
+                                     const char *hyp_vecs_end)
+@@ -118,6 +123,21 @@ static void  install_bp_hardening_cb(con
+       __install_bp_hardening_cb(fn, hyp_vecs_start, hyp_vecs_end);
+ }
++
++#include <linux/psci.h>
++
++static int enable_psci_bp_hardening(void *data)
++{
++      const struct arm64_cpu_capabilities *entry = data;
++
++      if (psci_ops.get_version)
++              install_bp_hardening_cb(entry,
++                                     (bp_hardening_cb_t)psci_ops.get_version,
++                                     __psci_hyp_bp_inval_start,
++                                     __psci_hyp_bp_inval_end);
++
++      return 0;
++}
+ #endif        /* CONFIG_HARDEN_BRANCH_PREDICTOR */
+ #define MIDR_RANGE(model, min, max) \
+@@ -211,6 +231,28 @@ const struct arm64_cpu_capabilities arm6
+               .def_scope = SCOPE_LOCAL_CPU,
+               .enable = cpu_enable_trap_ctr_access,
+       },
++#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
++      {
++              .capability = ARM64_HARDEN_BRANCH_PREDICTOR,
++              MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
++              .enable = enable_psci_bp_hardening,
++      },
++      {
++              .capability = ARM64_HARDEN_BRANCH_PREDICTOR,
++              MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
++              .enable = enable_psci_bp_hardening,
++      },
++      {
++              .capability = ARM64_HARDEN_BRANCH_PREDICTOR,
++              MIDR_ALL_VERSIONS(MIDR_CORTEX_A73),
++              .enable = enable_psci_bp_hardening,
++      },
++      {
++              .capability = ARM64_HARDEN_BRANCH_PREDICTOR,
++              MIDR_ALL_VERSIONS(MIDR_CORTEX_A75),
++              .enable = enable_psci_bp_hardening,
++      },
++#endif
+       {
+       }
+ };
diff --git a/queue-4.9/arm64-kill-psci_get_version-as-a-variant-2-workaround.patch b/queue-4.9/arm64-kill-psci_get_version-as-a-variant-2-workaround.patch
new file mode 100644 (file)
index 0000000..40124d8
--- /dev/null
@@ -0,0 +1,214 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:38 +0100
+Subject: [PATCH v4.9.y 42/42] arm64: Kill PSCI_GET_VERSION as a variant-2 workaround
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-43-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit 3a0a397ff5ff8b56ca9f7908b75dee6bf0b5fabb upstream.
+
+Now that we've standardised on SMCCC v1.1 to perform the branch
+prediction invalidation, let's drop the previous band-aid.
+If vendors haven't updated their firmware to do SMCCC 1.1, they
+haven't updated PSCI either, so we don't lose anything.
+
+Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kernel/bpi.S        |   24 ---------------------
+ arch/arm64/kernel/cpu_errata.c |   45 +++++++++++------------------------------
+ arch/arm64/kvm/hyp/switch.c    |   14 ------------
+ 3 files changed, 13 insertions(+), 70 deletions(-)
+
+--- a/arch/arm64/kernel/bpi.S
++++ b/arch/arm64/kernel/bpi.S
+@@ -54,30 +54,6 @@ ENTRY(__bp_harden_hyp_vecs_start)
+       vectors __kvm_hyp_vector
+       .endr
+ ENTRY(__bp_harden_hyp_vecs_end)
+-ENTRY(__psci_hyp_bp_inval_start)
+-      sub     sp, sp, #(8 * 18)
+-      stp     x16, x17, [sp, #(16 * 0)]
+-      stp     x14, x15, [sp, #(16 * 1)]
+-      stp     x12, x13, [sp, #(16 * 2)]
+-      stp     x10, x11, [sp, #(16 * 3)]
+-      stp     x8, x9, [sp, #(16 * 4)]
+-      stp     x6, x7, [sp, #(16 * 5)]
+-      stp     x4, x5, [sp, #(16 * 6)]
+-      stp     x2, x3, [sp, #(16 * 7)]
+-      stp     x0, x1, [sp, #(16 * 8)]
+-      mov     x0, #0x84000000
+-      smc     #0
+-      ldp     x16, x17, [sp, #(16 * 0)]
+-      ldp     x14, x15, [sp, #(16 * 1)]
+-      ldp     x12, x13, [sp, #(16 * 2)]
+-      ldp     x10, x11, [sp, #(16 * 3)]
+-      ldp     x8, x9, [sp, #(16 * 4)]
+-      ldp     x6, x7, [sp, #(16 * 5)]
+-      ldp     x4, x5, [sp, #(16 * 6)]
+-      ldp     x2, x3, [sp, #(16 * 7)]
+-      ldp     x0, x1, [sp, #(16 * 8)]
+-      add     sp, sp, #(8 * 18)
+-ENTRY(__psci_hyp_bp_inval_end)
+ .macro smccc_workaround_1 inst
+       sub     sp, sp, #(8 * 4)
+--- a/arch/arm64/kernel/cpu_errata.c
++++ b/arch/arm64/kernel/cpu_errata.c
+@@ -53,7 +53,6 @@ static int cpu_enable_trap_ctr_access(vo
+ DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);
+ #ifdef CONFIG_KVM
+-extern char __psci_hyp_bp_inval_start[], __psci_hyp_bp_inval_end[];
+ extern char __smccc_workaround_1_smc_start[];
+ extern char __smccc_workaround_1_smc_end[];
+ extern char __smccc_workaround_1_hvc_start[];
+@@ -100,8 +99,6 @@ static void __install_bp_hardening_cb(bp
+       spin_unlock(&bp_lock);
+ }
+ #else
+-#define __psci_hyp_bp_inval_start             NULL
+-#define __psci_hyp_bp_inval_end                       NULL
+ #define __smccc_workaround_1_smc_start                NULL
+ #define __smccc_workaround_1_smc_end          NULL
+ #define __smccc_workaround_1_hvc_start                NULL
+@@ -146,24 +143,25 @@ static void call_hvc_arch_workaround_1(v
+       arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
+ }
+-static bool check_smccc_arch_workaround_1(const struct arm64_cpu_capabilities *entry)
++static int enable_smccc_arch_workaround_1(void *data)
+ {
++      const struct arm64_cpu_capabilities *entry = data;
+       bp_hardening_cb_t cb;
+       void *smccc_start, *smccc_end;
+       struct arm_smccc_res res;
+       if (!entry->matches(entry, SCOPE_LOCAL_CPU))
+-              return false;
++              return 0;
+       if (psci_ops.smccc_version == SMCCC_VERSION_1_0)
+-              return false;
++              return 0;
+       switch (psci_ops.conduit) {
+       case PSCI_CONDUIT_HVC:
+               arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+                                 ARM_SMCCC_ARCH_WORKAROUND_1, &res);
+               if (res.a0)
+-                      return false;
++                      return 0;
+               cb = call_hvc_arch_workaround_1;
+               smccc_start = __smccc_workaround_1_hvc_start;
+               smccc_end = __smccc_workaround_1_hvc_end;
+@@ -173,35 +171,18 @@ static bool check_smccc_arch_workaround_
+               arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+                                 ARM_SMCCC_ARCH_WORKAROUND_1, &res);
+               if (res.a0)
+-                      return false;
++                      return 0;
+               cb = call_smc_arch_workaround_1;
+               smccc_start = __smccc_workaround_1_smc_start;
+               smccc_end = __smccc_workaround_1_smc_end;
+               break;
+       default:
+-              return false;
++              return 0;
+       }
+       install_bp_hardening_cb(entry, cb, smccc_start, smccc_end);
+-      return true;
+-}
+-
+-static int enable_psci_bp_hardening(void *data)
+-{
+-      const struct arm64_cpu_capabilities *entry = data;
+-
+-      if (psci_ops.get_version) {
+-              if (check_smccc_arch_workaround_1(entry))
+-                      return 0;
+-
+-              install_bp_hardening_cb(entry,
+-                                     (bp_hardening_cb_t)psci_ops.get_version,
+-                                     __psci_hyp_bp_inval_start,
+-                                     __psci_hyp_bp_inval_end);
+-      }
+-
+       return 0;
+ }
+ #endif        /* CONFIG_HARDEN_BRANCH_PREDICTOR */
+@@ -301,32 +282,32 @@ const struct arm64_cpu_capabilities arm6
+       {
+               .capability = ARM64_HARDEN_BRANCH_PREDICTOR,
+               MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
+-              .enable = enable_psci_bp_hardening,
++              .enable = enable_smccc_arch_workaround_1,
+       },
+       {
+               .capability = ARM64_HARDEN_BRANCH_PREDICTOR,
+               MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
+-              .enable = enable_psci_bp_hardening,
++              .enable = enable_smccc_arch_workaround_1,
+       },
+       {
+               .capability = ARM64_HARDEN_BRANCH_PREDICTOR,
+               MIDR_ALL_VERSIONS(MIDR_CORTEX_A73),
+-              .enable = enable_psci_bp_hardening,
++              .enable = enable_smccc_arch_workaround_1,
+       },
+       {
+               .capability = ARM64_HARDEN_BRANCH_PREDICTOR,
+               MIDR_ALL_VERSIONS(MIDR_CORTEX_A75),
+-              .enable = enable_psci_bp_hardening,
++              .enable = enable_smccc_arch_workaround_1,
+       },
+       {
+               .capability = ARM64_HARDEN_BRANCH_PREDICTOR,
+               MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN),
+-              .enable = enable_psci_bp_hardening,
++              .enable = enable_smccc_arch_workaround_1,
+       },
+       {
+               .capability = ARM64_HARDEN_BRANCH_PREDICTOR,
+               MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2),
+-              .enable = enable_psci_bp_hardening,
++              .enable = enable_smccc_arch_workaround_1,
+       },
+ #endif
+       {
+--- a/arch/arm64/kvm/hyp/switch.c
++++ b/arch/arm64/kvm/hyp/switch.c
+@@ -311,20 +311,6 @@ again:
+       if (exit_code == ARM_EXCEPTION_TRAP && !__populate_fault_info(vcpu))
+               goto again;
+-      if (exit_code == ARM_EXCEPTION_TRAP &&
+-          (kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_HVC64 ||
+-           kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_HVC32)) {
+-              u32 val = vcpu_get_reg(vcpu, 0);
+-
+-              if (val == PSCI_0_2_FN_PSCI_VERSION) {
+-                      val = kvm_psci_version(vcpu, kern_hyp_va(vcpu->kvm));
+-                      if (unlikely(val == KVM_ARM_PSCI_0_1))
+-                              val = PSCI_RET_NOT_SUPPORTED;
+-                      vcpu_set_reg(vcpu, 0, val);
+-                      goto again;
+-              }
+-      }
+-
+       if (static_branch_unlikely(&vgic_v2_cpuif_trap) &&
+           exit_code == ARM_EXCEPTION_TRAP) {
+               bool valid;
diff --git a/queue-4.9/arm64-kvm-add-smccc_arch_workaround_1-fast-handling.patch b/queue-4.9/arm64-kvm-add-smccc_arch_workaround_1-fast-handling.patch
new file mode 100644 (file)
index 0000000..064f736
--- /dev/null
@@ -0,0 +1,76 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:32 +0100
+Subject: [PATCH v4.9.y 36/42] arm64: KVM: Add SMCCC_ARCH_WORKAROUND_1 fast handling
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-37-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit f72af90c3783d924337624659b43e2d36f1b36b4 upstream.
+
+We want SMCCC_ARCH_WORKAROUND_1 to be fast. As fast as possible.
+So let's intercept it as early as we can by testing for the
+function call number as soon as we've identified an HVC call
+coming from the guest.
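+
+The call being short-circuited is the guest-side invocation of the
+workaround, the same arm_smccc_1_1_hvc() call the host uses for its own
+hardening callback earlier in this series; roughly (the wrapper name is
+illustrative):
+
+#include <linux/arm-smccc.h>
+
+static void guest_harden_branch_predictor(void)
+{
+        /* Served entirely by the el1_hvc_guest fast path added below,
+         * without falling through to the full trap handling. */
+        arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
+}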
+
+Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/hyp/hyp-entry.S |   20 ++++++++++++++++++--
+ 1 file changed, 18 insertions(+), 2 deletions(-)
+
+--- a/arch/arm64/kvm/hyp/hyp-entry.S
++++ b/arch/arm64/kvm/hyp/hyp-entry.S
+@@ -15,6 +15,7 @@
+  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  */
++#include <linux/arm-smccc.h>
+ #include <linux/linkage.h>
+ #include <asm/alternative.h>
+@@ -79,10 +80,11 @@ alternative_endif
+       lsr     x0, x1, #ESR_ELx_EC_SHIFT
+       cmp     x0, #ESR_ELx_EC_HVC64
++      ccmp    x0, #ESR_ELx_EC_HVC32, #4, ne
+       b.ne    el1_trap
+-      mrs     x1, vttbr_el2           // If vttbr is valid, the 64bit guest
+-      cbnz    x1, el1_trap            // called HVC
++      mrs     x1, vttbr_el2           // If vttbr is valid, the guest
++      cbnz    x1, el1_hvc_guest       // called HVC
+       /* Here, we're pretty sure the host called HVC. */
+       ldp     x0, x1, [sp], #16
+@@ -101,6 +103,20 @@ alternative_endif
+ 2:    eret
++el1_hvc_guest:
++      /*
++       * Fastest possible path for ARM_SMCCC_ARCH_WORKAROUND_1.
++       * The workaround has already been applied on the host,
++       * so let's quickly get back to the guest. We don't bother
++       * restoring x1, as it can be clobbered anyway.
++       */
++      ldr     x1, [sp]                                // Guest's x0
++      eor     w1, w1, #ARM_SMCCC_ARCH_WORKAROUND_1
++      cbnz    w1, el1_trap
++      mov     x0, x1
++      add     sp, sp, #16
++      eret
++
+ el1_trap:
+       /*
+        * x0: ESR_EC
diff --git a/queue-4.9/arm64-kvm-increment-pc-after-handling-an-smc-trap.patch b/queue-4.9/arm64-kvm-increment-pc-after-handling-an-smc-trap.patch
new file mode 100644 (file)
index 0000000..c3149c6
--- /dev/null
@@ -0,0 +1,54 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:23 +0100
+Subject: [PATCH v4.9.y 27/42] arm64: KVM: Increment PC after handling an SMC trap
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-28-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit f5115e8869e1dfafac0e414b4f1664f3a84a4683 upstream.
+
+When handling an SMC trap, the "preferred return address" is set
+to that of the SMC, and not the next PC (which is a departure from
+the behaviour of an SMC that isn't trapped).
+
+Increment PC in the handler, as the guest is otherwise forever
+stuck...
+
+Cc: stable@vger.kernel.org
+Fixes: acfb3b883f6d ("arm64: KVM: Fix SMCCC handling of unimplemented SMC/HVC calls")
+Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
+Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/handle_exit.c |    9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/arch/arm64/kvm/handle_exit.c
++++ b/arch/arm64/kvm/handle_exit.c
+@@ -53,7 +53,16 @@ static int handle_hvc(struct kvm_vcpu *v
+ static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run)
+ {
++      /*
++       * "If an SMC instruction executed at Non-secure EL1 is
++       * trapped to EL2 because HCR_EL2.TSC is 1, the exception is a
++       * Trap exception, not a Secure Monitor Call exception [...]"
++       *
++       * We need to advance the PC after the trap, as it would
++       * otherwise return to the same address...
++       */
+       vcpu_set_reg(vcpu, 0, ~0UL);
++      kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+       return 1;
+ }
diff --git a/queue-4.9/arm64-kvm-make-psci_version-a-fast-path.patch b/queue-4.9/arm64-kvm-make-psci_version-a-fast-path.patch
new file mode 100644 (file)
index 0000000..46bd64a
--- /dev/null
@@ -0,0 +1,58 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:29 +0100
+Subject: [PATCH v4.9.y 33/42] arm64: KVM: Make PSCI_VERSION a fast path
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-34-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit 90348689d500410ca7a55624c667f956771dce7f upstream.
+
+For those CPUs that require PSCI to perform a BP invalidation,
+going all the way to the PSCI code for not much is a waste of
+precious cycles. Let's terminate that call as early as possible.
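+
+The literal 2 in the hunk below is just the PSCI version encoding of v0.2:
+the major version lives in bits [31:16] and the minor version in bits
+[15:0], so 0.2 encodes as plain 2. A tiny illustration of the encoding
+(the helper name is illustrative):
+
+#include <stdio.h>
+
+static unsigned int psci_version(unsigned int major, unsigned int minor)
+{
+        return (major << 16) | (minor & 0xffff);
+}
+
+int main(void)
+{
+        /* prints "2 0x10000", i.e. PSCI 0.2 and PSCI 1.0 */
+        printf("%u %#x\n", psci_version(0, 2), psci_version(1, 0));
+        return 0;
+}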
+
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kvm/hyp/switch.c |   13 +++++++++++++
+ 1 file changed, 13 insertions(+)
+
+--- a/arch/arm64/kvm/hyp/switch.c
++++ b/arch/arm64/kvm/hyp/switch.c
+@@ -17,6 +17,7 @@
+ #include <linux/types.h>
+ #include <linux/jump_label.h>
++#include <uapi/linux/psci.h>
+ #include <asm/kvm_asm.h>
+ #include <asm/kvm_emulate.h>
+@@ -308,6 +309,18 @@ again:
+       if (exit_code == ARM_EXCEPTION_TRAP && !__populate_fault_info(vcpu))
+               goto again;
++      if (exit_code == ARM_EXCEPTION_TRAP &&
++          (kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_HVC64 ||
++           kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_HVC32) &&
++          vcpu_get_reg(vcpu, 0) == PSCI_0_2_FN_PSCI_VERSION) {
++              u64 val = PSCI_RET_NOT_SUPPORTED;
++              if (test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features))
++                      val = 2;
++
++              vcpu_set_reg(vcpu, 0, val);
++              goto again;
++      }
++
+       if (static_branch_unlikely(&vgic_v2_cpuif_trap) &&
+           exit_code == ARM_EXCEPTION_TRAP) {
+               bool valid;
diff --git a/queue-4.9/arm64-kvm-report-smccc_arch_workaround_1-bp-hardening-support.patch b/queue-4.9/arm64-kvm-report-smccc_arch_workaround_1-bp-hardening-support.patch
new file mode 100644 (file)
index 0000000..822c40b
--- /dev/null
@@ -0,0 +1,101 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:31 +0100
+Subject: [PATCH v4.9.y 35/42] arm64: KVM: Report SMCCC_ARCH_WORKAROUND_1 BP hardening support
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-36-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit 6167ec5c9145cdf493722dfd80a5d48bafc4a18a upstream.
+
+A new feature of SMCCC 1.1 is that it offers firmware-based CPU
+workarounds. In particular, SMCCC_ARCH_WORKAROUND_1 provides
+BP hardening for CVE-2017-5715.
+
+If the host has some mitigation for this issue, report that
+we deal with it using SMCCC_ARCH_WORKAROUND_1, as we apply the
+host workaround on every guest exit.
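+
+From the guest's point of view, discovery of the mitigation then looks
+roughly like the sketch below (the function name is illustrative, and a
+real guest would first confirm SMCCC >= 1.1 via ARM_SMCCC_VERSION_FUNC_ID):
+
+#include <linux/arm-smccc.h>
+
+static bool bp_hardening_advertised(void)
+{
+        struct arm_smccc_res res;
+
+        arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
+                          ARM_SMCCC_ARCH_WORKAROUND_1, &res);
+
+        /* The handler below returns 0 when the host mitigation is active. */
+        return res.a0 == 0;
+}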
+
+Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Reviewed-by: Christoffer Dall <christoffer.dall@linaro.org>
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+[v4.9: account for files moved to virt/ upstream]
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm/include/asm/kvm_host.h   |    6 ++++++
+ arch/arm/kvm/psci.c               |    9 ++++++++-
+ arch/arm64/include/asm/kvm_host.h |    5 +++++
+ include/linux/arm-smccc.h         |    5 +++++
+ 4 files changed, 24 insertions(+), 1 deletion(-)
+
+--- a/arch/arm/include/asm/kvm_host.h
++++ b/arch/arm/include/asm/kvm_host.h
+@@ -318,4 +318,10 @@ static inline int kvm_arm_vcpu_arch_has_
+       return -ENXIO;
+ }
++static inline bool kvm_arm_harden_branch_predictor(void)
++{
++      /* No way to detect it yet, pretend it is not there. */
++      return false;
++}
++
+ #endif /* __ARM_KVM_HOST_H__ */
+--- a/arch/arm/kvm/psci.c
++++ b/arch/arm/kvm/psci.c
+@@ -403,13 +403,20 @@ int kvm_hvc_call_handler(struct kvm_vcpu
+ {
+       u32 func_id = smccc_get_function(vcpu);
+       u32 val = PSCI_RET_NOT_SUPPORTED;
++      u32 feature;
+       switch (func_id) {
+       case ARM_SMCCC_VERSION_FUNC_ID:
+               val = ARM_SMCCC_VERSION_1_1;
+               break;
+       case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
+-              /* Nothing supported yet */
++              feature = smccc_get_arg1(vcpu);
++              switch(feature) {
++              case ARM_SMCCC_ARCH_WORKAROUND_1:
++                      if (kvm_arm_harden_branch_predictor())
++                              val = 0;
++                      break;
++              }
+               break;
+       default:
+               return kvm_psci_call(vcpu);
+--- a/arch/arm64/include/asm/kvm_host.h
++++ b/arch/arm64/include/asm/kvm_host.h
+@@ -393,4 +393,9 @@ static inline void __cpu_init_stage2(voi
+                 "PARange is %d bits, unsupported configuration!", parange);
+ }
++static inline bool kvm_arm_harden_branch_predictor(void)
++{
++      return cpus_have_cap(ARM64_HARDEN_BRANCH_PREDICTOR);
++}
++
+ #endif /* __ARM64_KVM_HOST_H__ */
+--- a/include/linux/arm-smccc.h
++++ b/include/linux/arm-smccc.h
+@@ -73,6 +73,11 @@
+                          ARM_SMCCC_SMC_32,                            \
+                          0, 1)
++#define ARM_SMCCC_ARCH_WORKAROUND_1                                   \
++      ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                         \
++                         ARM_SMCCC_SMC_32,                            \
++                         0, 0x8000)
++
+ #ifndef __ASSEMBLY__
+ #include <linux/linkage.h>
diff --git a/queue-4.9/arm64-kvm-use-per-cpu-vector-when-bp-hardening-is-enabled.patch b/queue-4.9/arm64-kvm-use-per-cpu-vector-when-bp-hardening-is-enabled.patch
new file mode 100644 (file)
index 0000000..dec3102
--- /dev/null
@@ -0,0 +1,132 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:16 +0100
+Subject: [PATCH v4.9.y 20/42] arm64: KVM: Use per-CPU vector when BP hardening is enabled
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-21-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit 6840bdd73d07216ab4bc46f5a8768c37ea519038 upstream.
+
+Now that we have per-CPU vectors, let's plug them into the KVM/arm64 code.
+
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+[v4.9: account for files moved to virt/ upstream, use cpus_have_cap()]
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm/include/asm/kvm_mmu.h   |   10 ++++++++++
+ arch/arm/kvm/arm.c               |    9 ++++++++-
+ arch/arm64/include/asm/kvm_mmu.h |   38 ++++++++++++++++++++++++++++++++++++++
+ arch/arm64/kvm/hyp/switch.c      |    2 +-
+ 4 files changed, 57 insertions(+), 2 deletions(-)
+
+--- a/arch/arm/include/asm/kvm_mmu.h
++++ b/arch/arm/include/asm/kvm_mmu.h
+@@ -223,6 +223,16 @@ static inline unsigned int kvm_get_vmid_
+       return 8;
+ }
++static inline void *kvm_get_hyp_vector(void)
++{
++      return kvm_ksym_ref(__kvm_hyp_vector);
++}
++
++static inline int kvm_map_vectors(void)
++{
++      return 0;
++}
++
+ #endif        /* !__ASSEMBLY__ */
+ #endif /* __ARM_KVM_MMU_H__ */
+--- a/arch/arm/kvm/arm.c
++++ b/arch/arm/kvm/arm.c
+@@ -1088,7 +1088,7 @@ static void cpu_init_hyp_mode(void *dumm
+       pgd_ptr = kvm_mmu_get_httbr();
+       stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
+       hyp_stack_ptr = stack_page + PAGE_SIZE;
+-      vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector);
++      vector_ptr = (unsigned long)kvm_get_hyp_vector();
+       __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
+       __cpu_init_stage2();
+@@ -1345,6 +1345,13 @@ static int init_hyp_mode(void)
+               goto out_err;
+       }
++
++      err = kvm_map_vectors();
++      if (err) {
++              kvm_err("Cannot map vectors\n");
++              goto out_err;
++      }
++
+       /*
+        * Map the Hyp stack pages
+        */
+--- a/arch/arm64/include/asm/kvm_mmu.h
++++ b/arch/arm64/include/asm/kvm_mmu.h
+@@ -313,5 +313,43 @@ static inline unsigned int kvm_get_vmid_
+       return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8;
+ }
++#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR
++#include <asm/mmu.h>
++
++static inline void *kvm_get_hyp_vector(void)
++{
++      struct bp_hardening_data *data = arm64_get_bp_hardening_data();
++      void *vect = kvm_ksym_ref(__kvm_hyp_vector);
++
++      if (data->fn) {
++              vect = __bp_harden_hyp_vecs_start +
++                     data->hyp_vectors_slot * SZ_2K;
++
++              if (!cpus_have_cap(ARM64_HAS_VIRT_HOST_EXTN))
++                      vect = lm_alias(vect);
++      }
++
++      return vect;
++}
++
++static inline int kvm_map_vectors(void)
++{
++      return create_hyp_mappings(kvm_ksym_ref(__bp_harden_hyp_vecs_start),
++                                 kvm_ksym_ref(__bp_harden_hyp_vecs_end),
++                                 PAGE_HYP_EXEC);
++}
++
++#else
++static inline void *kvm_get_hyp_vector(void)
++{
++      return kvm_ksym_ref(__kvm_hyp_vector);
++}
++
++static inline int kvm_map_vectors(void)
++{
++      return 0;
++}
++#endif
++
+ #endif /* __ASSEMBLY__ */
+ #endif /* __ARM64_KVM_MMU_H__ */
+--- a/arch/arm64/kvm/hyp/switch.c
++++ b/arch/arm64/kvm/hyp/switch.c
+@@ -50,7 +50,7 @@ static void __hyp_text __activate_traps_
+       val &= ~CPACR_EL1_FPEN;
+       write_sysreg(val, cpacr_el1);
+-      write_sysreg(__kvm_hyp_vector, vbar_el1);
++      write_sysreg(kvm_get_hyp_vector(), vbar_el1);
+ }
+ static void __hyp_text __activate_traps_nvhe(void)
diff --git a/queue-4.9/arm64-make-user_ds-an-inclusive-limit.patch b/queue-4.9/arm64-make-user_ds-an-inclusive-limit.patch
new file mode 100644 (file)
index 0000000..3a8f88d
--- /dev/null
@@ -0,0 +1,157 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:00 +0100
+Subject: [PATCH v4.9.y 04/42] arm64: Make USER_DS an inclusive limit
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-5-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Robin Murphy <robin.murphy@arm.com>
+
+commit 51369e398d0d33e8f524314e672b07e8cf870e79 upstream.
+
+Currently, USER_DS represents an exclusive limit while KERNEL_DS is
+inclusive. In order to do some clever trickery for speculation-safe
+masking, we need them both to behave equivalently - there aren't enough
+bits to make KERNEL_DS exclusive, so we have precisely one option. This
+also happens to correct a longstanding false negative for a range
+ending on the very top byte of kernel memory.
+
+Mark Rutland points out that we've actually got the semantics of
+addresses vs. segments muddled up in most of the places we need to
+amend, so shuffle the {USER,KERNEL}_DS definitions around such that we
+can correct those properly instead of just pasting "-1"s everywhere.
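+
+With both limits inclusive, the access_ok() test implemented in asm further
+down reduces to "addr + size <= limit + 1", evaluated without 64-bit
+overflow. A small userspace illustration, using a compiler-provided 128-bit
+type in place of the "u65" from the comment and assuming a 48-bit user VA
+size:
+
+#include <stdbool.h>
+#include <stdio.h>
+
+static bool range_ok(unsigned long addr, unsigned long size,
+                     unsigned long limit)
+{
+        return (unsigned __int128)addr + size <= (unsigned __int128)limit + 1;
+}
+
+int main(void)
+{
+        unsigned long user_ds = (1UL << 48) - 1;        /* TASK_SIZE_64 - 1 */
+
+        printf("%d\n", range_ok(user_ds, 1, user_ds));  /* 1: last byte OK */
+        printf("%d\n", range_ok(user_ds, 2, user_ds));  /* 0: past the end */
+        printf("%d\n", range_ok(~0UL, 16, user_ds));    /* 0: would wrap   */
+        return 0;
+}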
+
+Signed-off-by: Robin Murphy <robin.murphy@arm.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+[v4.9: avoid dependence on TTBR0 SW PAN and THREAD_INFO_IN_TASK_STRUCT]
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/processor.h |    3 ++
+ arch/arm64/include/asm/uaccess.h   |   46 +++++++++++++++++++++----------------
+ arch/arm64/kernel/entry.S          |    4 +--
+ arch/arm64/mm/fault.c              |    2 -
+ 4 files changed, 33 insertions(+), 22 deletions(-)
+
+--- a/arch/arm64/include/asm/processor.h
++++ b/arch/arm64/include/asm/processor.h
+@@ -21,6 +21,9 @@
+ #define TASK_SIZE_64          (UL(1) << VA_BITS)
++#define KERNEL_DS     UL(-1)
++#define USER_DS               (TASK_SIZE_64 - 1)
++
+ #ifndef __ASSEMBLY__
+ /*
+--- a/arch/arm64/include/asm/uaccess.h
++++ b/arch/arm64/include/asm/uaccess.h
+@@ -28,6 +28,7 @@
+ #include <asm/alternative.h>
+ #include <asm/cpufeature.h>
++#include <asm/processor.h>
+ #include <asm/ptrace.h>
+ #include <asm/sysreg.h>
+ #include <asm/errno.h>
+@@ -59,10 +60,7 @@ struct exception_table_entry
+ extern int fixup_exception(struct pt_regs *regs);
+-#define KERNEL_DS     (-1UL)
+ #define get_ds()      (KERNEL_DS)
+-
+-#define USER_DS               TASK_SIZE_64
+ #define get_fs()      (current_thread_info()->addr_limit)
+ static inline void set_fs(mm_segment_t fs)
+@@ -87,22 +85,32 @@ static inline void set_fs(mm_segment_t f
+  * Returns 1 if the range is valid, 0 otherwise.
+  *
+  * This is equivalent to the following test:
+- * (u65)addr + (u65)size <= current->addr_limit
+- *
+- * This needs 65-bit arithmetic.
++ * (u65)addr + (u65)size <= (u65)current->addr_limit + 1
+  */
+-#define __range_ok(addr, size)                                                \
+-({                                                                    \
+-      unsigned long __addr = (unsigned long __force)(addr);           \
+-      unsigned long flag, roksum;                                     \
+-      __chk_user_ptr(addr);                                           \
+-      asm("adds %1, %1, %3; ccmp %1, %4, #2, cc; cset %0, ls"         \
+-              : "=&r" (flag), "=&r" (roksum)                          \
+-              : "1" (__addr), "Ir" (size),                            \
+-                "r" (current_thread_info()->addr_limit)               \
+-              : "cc");                                                \
+-      flag;                                                           \
+-})
++static inline unsigned long __range_ok(unsigned long addr, unsigned long size)
++{
++      unsigned long limit = current_thread_info()->addr_limit;
++
++      __chk_user_ptr(addr);
++      asm volatile(
++      // A + B <= C + 1 for all A,B,C, in four easy steps:
++      // 1: X = A + B; X' = X % 2^64
++      "       adds    %0, %0, %2\n"
++      // 2: Set C = 0 if X > 2^64, to guarantee X' > C in step 4
++      "       csel    %1, xzr, %1, hi\n"
++      // 3: Set X' = ~0 if X >= 2^64. For X == 2^64, this decrements X'
++      //    to compensate for the carry flag being set in step 4. For
++      //    X > 2^64, X' merely has to remain nonzero, which it does.
++      "       csinv   %0, %0, xzr, cc\n"
++      // 4: For X < 2^64, this gives us X' - C - 1 <= 0, where the -1
++      //    comes from the carry in being clear. Otherwise, we are
++      //    testing X' - C == 0, subject to the previous adjustments.
++      "       sbcs    xzr, %0, %1\n"
++      "       cset    %0, ls\n"
++      : "+r" (addr), "+r" (limit) : "Ir" (size) : "cc");
++
++      return addr;
++}
+ /*
+  * When dealing with data aborts, watchpoints, or instruction traps we may end
+@@ -111,7 +119,7 @@ static inline void set_fs(mm_segment_t f
+  */
+ #define untagged_addr(addr)           sign_extend64(addr, 55)
+-#define access_ok(type, addr, size)   __range_ok(addr, size)
++#define access_ok(type, addr, size)   __range_ok((unsigned long)(addr), size)
+ #define user_addr_max                 get_fs
+ #define _ASM_EXTABLE(from, to)                                                \
+--- a/arch/arm64/kernel/entry.S
++++ b/arch/arm64/kernel/entry.S
+@@ -126,10 +126,10 @@ alternative_else_nop_endif
+       .else
+       add     x21, sp, #S_FRAME_SIZE
+       get_thread_info tsk
+-      /* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */
++      /* Save the task's original addr_limit and set USER_DS */
+       ldr     x20, [tsk, #TI_ADDR_LIMIT]
+       str     x20, [sp, #S_ORIG_ADDR_LIMIT]
+-      mov     x20, #TASK_SIZE_64
++      mov     x20, #USER_DS
+       str     x20, [tsk, #TI_ADDR_LIMIT]
+       /* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */
+       .endif /* \el == 0 */
+--- a/arch/arm64/mm/fault.c
++++ b/arch/arm64/mm/fault.c
+@@ -332,7 +332,7 @@ static int __kprobes do_page_fault(unsig
+               mm_flags |= FAULT_FLAG_WRITE;
+       }
+-      if (is_permission_fault(esr) && (addr < USER_DS)) {
++      if (is_permission_fault(esr) && (addr < TASK_SIZE)) {
+               /* regs->orig_addr_limit may be 0 if we entered from EL0 */
+               if (regs->orig_addr_limit == KERNEL_DS)
+                       die("Accessing user space memory with fs=KERNEL_DS", regs, esr);
diff --git a/queue-4.9/arm64-move-bp-hardening-to-check_and_switch_context.patch b/queue-4.9/arm64-move-bp-hardening-to-check_and_switch_context.patch
new file mode 100644 (file)
index 0000000..cb53b85
--- /dev/null
@@ -0,0 +1,59 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:14 +0100
+Subject: [PATCH v4.9.y 18/42] arm64: Move BP hardening to check_and_switch_context
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-19-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit a8e4c0a919ae310944ed2c9ace11cf3ccd8a609b upstream.
+
+We call arm64_apply_bp_hardening() from post_ttbr_update_workaround,
+which has the unexpected consequence of being triggered on every
+exception return to userspace when ARM64_SW_TTBR0_PAN is selected,
+even if no context switch actually occurred.
+
+This is a bit suboptimal, and it would be more logical to only
+invalidate the branch predictor when we actually switch to
+a different mm.
+
+In order to solve this, move the call to arm64_apply_bp_hardening()
+into check_and_switch_context(), where we're guaranteed to pick
+a different mm context.
+
+Acked-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/mm/context.c |    5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/arch/arm64/mm/context.c
++++ b/arch/arm64/mm/context.c
+@@ -230,6 +230,9 @@ void check_and_switch_context(struct mm_
+       raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
+ switch_mm_fastpath:
++
++      arm64_apply_bp_hardening();
++
+       cpu_switch_mm(mm->pgd, mm);
+ }
+@@ -240,8 +243,6 @@ asmlinkage void post_ttbr_update_workaro
+                       "ic iallu; dsb nsh; isb",
+                       ARM64_WORKAROUND_CAVIUM_27456,
+                       CONFIG_CAVIUM_ERRATUM_27456));
+-
+-      arm64_apply_bp_hardening();
+ }
+ static int asids_init(void)
diff --git a/queue-4.9/arm64-move-post_ttbr_update_workaround-to-c-code.patch b/queue-4.9/arm64-move-post_ttbr_update_workaround-to-c-code.patch
new file mode 100644 (file)
index 0000000..835ad88
--- /dev/null
@@ -0,0 +1,81 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:12 +0100
+Subject: [PATCH v4.9.y 16/42] arm64: Move post_ttbr_update_workaround to C code
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-17-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit 95e3de3590e3f2358bb13f013911bc1bfa5d3f53 upstream.
+
+We will soon need to invoke a CPU-specific function pointer after changing
+page tables, so move post_ttbr_update_workaround out into C code to make
+this possible.
+
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/assembler.h |   13 -------------
+ arch/arm64/mm/context.c            |    9 +++++++++
+ arch/arm64/mm/proc.S               |    3 +--
+ 3 files changed, 10 insertions(+), 15 deletions(-)
+
+--- a/arch/arm64/include/asm/assembler.h
++++ b/arch/arm64/include/asm/assembler.h
+@@ -435,17 +435,4 @@ alternative_endif
+       and     \phys, \pte, #(((1 << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT)
+       .endm
+-/*
+- * Errata workaround post TTBR0_EL1 update.
+- */
+-      .macro  post_ttbr0_update_workaround
+-#ifdef CONFIG_CAVIUM_ERRATUM_27456
+-alternative_if ARM64_WORKAROUND_CAVIUM_27456
+-      ic      iallu
+-      dsb     nsh
+-      isb
+-alternative_else_nop_endif
+-#endif
+-      .endm
+-
+ #endif        /* __ASM_ASSEMBLER_H */
+--- a/arch/arm64/mm/context.c
++++ b/arch/arm64/mm/context.c
+@@ -233,6 +233,15 @@ switch_mm_fastpath:
+       cpu_switch_mm(mm->pgd, mm);
+ }
++/* Errata workaround post TTBRx_EL1 update. */
++asmlinkage void post_ttbr_update_workaround(void)
++{
++      asm(ALTERNATIVE("nop; nop; nop",
++                      "ic iallu; dsb nsh; isb",
++                      ARM64_WORKAROUND_CAVIUM_27456,
++                      CONFIG_CAVIUM_ERRATUM_27456));
++}
++
+ static int asids_init(void)
+ {
+       asid_bits = get_cpu_asid_bits();
+--- a/arch/arm64/mm/proc.S
++++ b/arch/arm64/mm/proc.S
+@@ -139,8 +139,7 @@ ENTRY(cpu_do_switch_mm)
+       isb
+       msr     ttbr0_el1, x0                   // now update TTBR0
+       isb
+-      post_ttbr0_update_workaround
+-      ret
++      b       post_ttbr_update_workaround     // Back to C code...
+ ENDPROC(cpu_do_switch_mm)
+       .pushsection ".idmap.text", "awx"
diff --git a/queue-4.9/arm64-move-task_-definitions-to-asm-processor.h.patch b/queue-4.9/arm64-move-task_-definitions-to-asm-processor.h.patch
new file mode 100644 (file)
index 0000000..f313be3
--- /dev/null
@@ -0,0 +1,128 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:10:59 +0100
+Subject: [PATCH v4.9.y 03/42] arm64: move TASK_* definitions to <asm/processor.h>
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-4-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Yury Norov <ynorov@caviumnetworks.com>
+
+commit eef94a3d09aab437c8c254de942d8b1aa76455e2 upstream.
+
+The ILP32 series [1] introduces a dependency on <asm/is_compat.h> for
+the TASK_SIZE macro, which in turn requires <asm/thread_info.h>;
+<asm/thread_info.h> includes <asm/memory.h>, giving a circular
+dependency because TASK_SIZE is currently located in <asm/memory.h>.
+
+In other architectures, TASK_SIZE is defined in <asm/processor.h>, and
+moving TASK_SIZE there fixes the problem.
+
+Discussion: https://patchwork.kernel.org/patch/9929107/
+
+[1] https://github.com/norov/linux/tree/ilp32-next
+
+CC: Will Deacon <will.deacon@arm.com>
+CC: Laura Abbott <labbott@redhat.com>
+Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: James Morse <james.morse@arm.com>
+Suggested-by: Mark Rutland <mark.rutland@arm.com>
+Signed-off-by: Yury Norov <ynorov@caviumnetworks.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+[v4.9: necessary for making USER_DS an inclusive limit]
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/memory.h    |   15 ---------------
+ arch/arm64/include/asm/processor.h |   21 +++++++++++++++++++++
+ arch/arm64/kernel/entry.S          |    1 +
+ 3 files changed, 22 insertions(+), 15 deletions(-)
+
+--- a/arch/arm64/include/asm/memory.h
++++ b/arch/arm64/include/asm/memory.h
+@@ -60,8 +60,6 @@
+  * KIMAGE_VADDR - the virtual address of the start of the kernel image
+  * VA_BITS - the maximum number of bits for virtual addresses.
+  * VA_START - the first kernel virtual address.
+- * TASK_SIZE - the maximum size of a user space task.
+- * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area.
+  */
+ #define VA_BITS                       (CONFIG_ARM64_VA_BITS)
+ #define VA_START              (UL(0xffffffffffffffff) - \
+@@ -76,19 +74,6 @@
+ #define PCI_IO_END            (VMEMMAP_START - SZ_2M)
+ #define PCI_IO_START          (PCI_IO_END - PCI_IO_SIZE)
+ #define FIXADDR_TOP           (PCI_IO_START - SZ_2M)
+-#define TASK_SIZE_64          (UL(1) << VA_BITS)
+-
+-#ifdef CONFIG_COMPAT
+-#define TASK_SIZE_32          UL(0x100000000)
+-#define TASK_SIZE             (test_thread_flag(TIF_32BIT) ? \
+-                              TASK_SIZE_32 : TASK_SIZE_64)
+-#define TASK_SIZE_OF(tsk)     (test_tsk_thread_flag(tsk, TIF_32BIT) ? \
+-                              TASK_SIZE_32 : TASK_SIZE_64)
+-#else
+-#define TASK_SIZE             TASK_SIZE_64
+-#endif /* CONFIG_COMPAT */
+-
+-#define TASK_UNMAPPED_BASE    (PAGE_ALIGN(TASK_SIZE / 4))
+ #define KERNEL_START      _text
+ #define KERNEL_END        _end
+--- a/arch/arm64/include/asm/processor.h
++++ b/arch/arm64/include/asm/processor.h
+@@ -19,6 +19,10 @@
+ #ifndef __ASM_PROCESSOR_H
+ #define __ASM_PROCESSOR_H
++#define TASK_SIZE_64          (UL(1) << VA_BITS)
++
++#ifndef __ASSEMBLY__
++
+ /*
+  * Default implementation of macro that returns current
+  * instruction pointer ("program counter").
+@@ -37,6 +41,22 @@
+ #include <asm/ptrace.h>
+ #include <asm/types.h>
++/*
++ * TASK_SIZE - the maximum size of a user space task.
++ * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area.
++ */
++#ifdef CONFIG_COMPAT
++#define TASK_SIZE_32          UL(0x100000000)
++#define TASK_SIZE             (test_thread_flag(TIF_32BIT) ? \
++                              TASK_SIZE_32 : TASK_SIZE_64)
++#define TASK_SIZE_OF(tsk)     (test_tsk_thread_flag(tsk, TIF_32BIT) ? \
++                              TASK_SIZE_32 : TASK_SIZE_64)
++#else
++#define TASK_SIZE             TASK_SIZE_64
++#endif /* CONFIG_COMPAT */
++
++#define TASK_UNMAPPED_BASE    (PAGE_ALIGN(TASK_SIZE / 4))
++
+ #define STACK_TOP_MAX         TASK_SIZE_64
+ #ifdef CONFIG_COMPAT
+ #define AARCH32_VECTORS_BASE  0xffff0000
+@@ -192,4 +212,5 @@ int cpu_enable_pan(void *__unused);
+ int cpu_enable_uao(void *__unused);
+ int cpu_enable_cache_maint_trap(void *__unused);
++#endif /* __ASSEMBLY__ */
+ #endif /* __ASM_PROCESSOR_H */
+--- a/arch/arm64/kernel/entry.S
++++ b/arch/arm64/kernel/entry.S
+@@ -30,6 +30,7 @@
+ #include <asm/irq.h>
+ #include <asm/memory.h>
+ #include <asm/mmu.h>
++#include <asm/processor.h>
+ #include <asm/thread_info.h>
+ #include <asm/asm-uaccess.h>
+ #include <asm/unistd.h>
diff --git a/queue-4.9/arm64-run-enable-method-for-errata-work-arounds-on-late-cpus.patch b/queue-4.9/arm64-run-enable-method-for-errata-work-arounds-on-late-cpus.patch
new file mode 100644 (file)
index 0000000..6bc2cec
--- /dev/null
@@ -0,0 +1,62 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:08 +0100
+Subject: [PATCH v4.9.y 12/42] arm64: Run enable method for errata work arounds on late CPUs
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-13-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Suzuki K Poulose <suzuki.poulose@arm.com>
+
+commit 55b35d070c2534dfb714b883f3c3ae05d02032da upstream.
+
+When a CPU is brought up after we have finalised the system
+wide capabilities (i.e., features and errata), we make sure the
+new CPU doesn't need a new errata work around which has not been
+detected already. However, we don't run the enable() method on the
+new CPU for the errata work arounds already detected. This could
+leave the new CPU running without the necessary work arounds.
+It is up to the "enable()" method to decide if this CPU should
+do something about the errata.
+
+Fixes: commit 6a6efbb45b7d95c84 ("arm64: Verify CPU errata work arounds on hotplugged CPU")
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Andre Przywara <andre.przywara@arm.com>
+Cc: Dave Martin <dave.martin@arm.com>
+Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/kernel/cpu_errata.c |    9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+
+--- a/arch/arm64/kernel/cpu_errata.c
++++ b/arch/arm64/kernel/cpu_errata.c
+@@ -143,15 +143,18 @@ void verify_local_cpu_errata_workarounds
+ {
+       const struct arm64_cpu_capabilities *caps = arm64_errata;
+-      for (; caps->matches; caps++)
+-              if (!cpus_have_cap(caps->capability) &&
+-                      caps->matches(caps, SCOPE_LOCAL_CPU)) {
++      for (; caps->matches; caps++) {
++              if (cpus_have_cap(caps->capability)) {
++                      if (caps->enable)
++                              caps->enable((void *)caps);
++              } else if (caps->matches(caps, SCOPE_LOCAL_CPU)) {
+                       pr_crit("CPU%d: Requires work around for %s, not detected"
+                                       " at boot time\n",
+                               smp_processor_id(),
+                               caps->desc ? : "an erratum");
+                       cpu_die_early();
+               }
++      }
+ }
+ void update_cpu_errata_workarounds(void)
diff --git a/queue-4.9/arm64-uaccess-don-t-bother-eliding-access_ok-checks-in-__-get-put-_user.patch b/queue-4.9/arm64-uaccess-don-t-bother-eliding-access_ok-checks-in-__-get-put-_user.patch
new file mode 100644 (file)
index 0000000..b9acc0a
--- /dev/null
@@ -0,0 +1,134 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:04 +0100
+Subject: [PATCH v4.9.y 08/42] arm64: uaccess: Don't bother eliding access_ok checks in __{get, put}_user
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-9-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit 84624087dd7e3b482b7b11c170ebc1f329b3a218 upstream.
+
+access_ok isn't an expensive operation once the addr_limit for the current
+thread has been loaded into the cache. Given that the initial access_ok
+check preceding a sequence of __{get,put}_user operations will take
+the brunt of the miss, we can make the __* variants identical to the
+full-fat versions, which brings with it the benefits of address masking.
+
+The likely cost in these sequences will be from toggling PAN/UAO, which
+we can address later by implementing the *_unsafe versions.
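+
+For callers, the visible effect is only that the double-underscore variants
+now behave exactly like the checked versions; a caller-side sketch (the
+function name is illustrative):
+
+#include <linux/types.h>
+#include <linux/uaccess.h>
+
+static int read_user_word(u32 __user *uptr, u32 *out)
+{
+        /* After this patch, both get_user() and __get_user() perform the
+         * access_ok() check and pointer masking; __get_user() is no longer
+         * an unchecked shortcut. Returns 0 or -EFAULT as usual. */
+        return __get_user(*out, uptr);
+}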
+
+Reviewed-by: Robin Murphy <robin.murphy@arm.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/uaccess.h |   62 ++++++++++++++++++++++-----------------
+ 1 file changed, 36 insertions(+), 26 deletions(-)
+
+--- a/arch/arm64/include/asm/uaccess.h
++++ b/arch/arm64/include/asm/uaccess.h
+@@ -209,30 +209,35 @@ do {                                                                     \
+                       CONFIG_ARM64_PAN));                             \
+ } while (0)
+-#define __get_user(x, ptr)                                            \
++#define __get_user_check(x, ptr, err)                                 \
+ ({                                                                    \
+-      int __gu_err = 0;                                               \
+-      __get_user_err((x), (ptr), __gu_err);                           \
+-      __gu_err;                                                       \
++      __typeof__(*(ptr)) __user *__p = (ptr);                         \
++      might_fault();                                                  \
++      if (access_ok(VERIFY_READ, __p, sizeof(*__p))) {                \
++              __p = uaccess_mask_ptr(__p);                            \
++              __get_user_err((x), __p, (err));                        \
++      } else {                                                        \
++              (x) = 0; (err) = -EFAULT;                               \
++      }                                                               \
+ })
+ #define __get_user_error(x, ptr, err)                                 \
+ ({                                                                    \
+-      __get_user_err((x), (ptr), (err));                              \
++      __get_user_check((x), (ptr), (err));                            \
+       (void)0;                                                        \
+ })
+-#define __get_user_unaligned __get_user
+-
+-#define get_user(x, ptr)                                              \
++#define __get_user(x, ptr)                                            \
+ ({                                                                    \
+-      __typeof__(*(ptr)) __user *__p = (ptr);                         \
+-      might_fault();                                                  \
+-      access_ok(VERIFY_READ, __p, sizeof(*__p)) ?                     \
+-              __p = uaccess_mask_ptr(__p), __get_user((x), __p) :     \
+-              ((x) = 0, -EFAULT);                                     \
++      int __gu_err = 0;                                               \
++      __get_user_check((x), (ptr), __gu_err);                         \
++      __gu_err;                                                       \
+ })
++#define __get_user_unaligned __get_user
++
++#define get_user      __get_user
++
+ #define __put_user_asm(instr, alt_instr, reg, x, addr, err, feature)  \
+       asm volatile(                                                   \
+       "1:"ALTERNATIVE(instr "     " reg "1, [%2]\n",                  \
+@@ -277,30 +282,35 @@ do {                                                                     \
+                       CONFIG_ARM64_PAN));                             \
+ } while (0)
+-#define __put_user(x, ptr)                                            \
++#define __put_user_check(x, ptr, err)                                 \
+ ({                                                                    \
+-      int __pu_err = 0;                                               \
+-      __put_user_err((x), (ptr), __pu_err);                           \
+-      __pu_err;                                                       \
++      __typeof__(*(ptr)) __user *__p = (ptr);                         \
++      might_fault();                                                  \
++      if (access_ok(VERIFY_WRITE, __p, sizeof(*__p))) {               \
++              __p = uaccess_mask_ptr(__p);                            \
++              __put_user_err((x), __p, (err));                        \
++      } else  {                                                       \
++              (err) = -EFAULT;                                        \
++      }                                                               \
+ })
+ #define __put_user_error(x, ptr, err)                                 \
+ ({                                                                    \
+-      __put_user_err((x), (ptr), (err));                              \
++      __put_user_check((x), (ptr), (err));                            \
+       (void)0;                                                        \
+ })
+-#define __put_user_unaligned __put_user
+-
+-#define put_user(x, ptr)                                              \
++#define __put_user(x, ptr)                                            \
+ ({                                                                    \
+-      __typeof__(*(ptr)) __user *__p = (ptr);                         \
+-      might_fault();                                                  \
+-      access_ok(VERIFY_WRITE, __p, sizeof(*__p)) ?                    \
+-              __p = uaccess_mask_ptr(__p), __put_user((x), __p) :     \
+-              -EFAULT;                                                \
++      int __pu_err = 0;                                               \
++      __put_user_check((x), (ptr), __pu_err);                         \
++      __pu_err;                                                       \
+ })
++#define __put_user_unaligned __put_user
++
++#define put_user      __put_user
++
+ extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n);
+ extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n);
+ extern unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n);
diff --git a/queue-4.9/arm64-uaccess-mask-__user-pointers-for-__arch_-clear-copy_-_user.patch b/queue-4.9/arm64-uaccess-mask-__user-pointers-for-__arch_-clear-copy_-_user.patch
new file mode 100644 (file)
index 0000000..15e7ebe
--- /dev/null
@@ -0,0 +1,151 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:05 +0100
+Subject: [PATCH v4.9.y 09/42] arm64: uaccess: Mask __user pointers for __arch_{clear, copy_*}_user
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-10-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit f71c2ffcb20dd8626880747557014bb9a61eb90e upstream.
+
+Like we've done for get_user and put_user, ensure that user pointers
+are masked before invoking the underlying __arch_{clear,copy_*}_user
+operations.
+
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+[v4.9: fixup for v4.9-style uaccess primitives]
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/uaccess.h |   18 ++++++++++--------
+ arch/arm64/kernel/arm64ksyms.c   |    4 ++--
+ arch/arm64/lib/clear_user.S      |    6 +++---
+ arch/arm64/lib/copy_in_user.S    |    4 ++--
+ 4 files changed, 17 insertions(+), 15 deletions(-)
+
+--- a/arch/arm64/include/asm/uaccess.h
++++ b/arch/arm64/include/asm/uaccess.h
+@@ -313,21 +313,20 @@ do {                                                                     \
+ extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n);
+ extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n);
+-extern unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n);
+-extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n);
++extern unsigned long __must_check __arch_copy_in_user(void __user *to, const void __user *from, unsigned long n);
+ static inline unsigned long __must_check __copy_from_user(void *to, const void __user *from, unsigned long n)
+ {
+       kasan_check_write(to, n);
+       check_object_size(to, n, false);
+-      return __arch_copy_from_user(to, from, n);
++      return __arch_copy_from_user(to, __uaccess_mask_ptr(from), n);
+ }
+ static inline unsigned long __must_check __copy_to_user(void __user *to, const void *from, unsigned long n)
+ {
+       kasan_check_read(from, n);
+       check_object_size(from, n, true);
+-      return __arch_copy_to_user(to, from, n);
++      return __arch_copy_to_user(__uaccess_mask_ptr(to), from, n);
+ }
+ static inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n)
+@@ -355,22 +354,25 @@ static inline unsigned long __must_check
+       return n;
+ }
+-static inline unsigned long __must_check copy_in_user(void __user *to, const void __user *from, unsigned long n)
++static inline unsigned long __must_check __copy_in_user(void __user *to, const void __user *from, unsigned long n)
+ {
+       if (access_ok(VERIFY_READ, from, n) && access_ok(VERIFY_WRITE, to, n))
+-              n = __copy_in_user(to, from, n);
++              n = __arch_copy_in_user(__uaccess_mask_ptr(to), __uaccess_mask_ptr(from), n);
+       return n;
+ }
++#define copy_in_user __copy_in_user
+ #define __copy_to_user_inatomic __copy_to_user
+ #define __copy_from_user_inatomic __copy_from_user
+-static inline unsigned long __must_check clear_user(void __user *to, unsigned long n)
++extern unsigned long __must_check __arch_clear_user(void __user *to, unsigned long n);
++static inline unsigned long __must_check __clear_user(void __user *to, unsigned long n)
+ {
+       if (access_ok(VERIFY_WRITE, to, n))
+-              n = __clear_user(__uaccess_mask_ptr(to), n);
++              n = __arch_clear_user(__uaccess_mask_ptr(to), n);
+       return n;
+ }
++#define clear_user    __clear_user
+ extern long strncpy_from_user(char *dest, const char __user *src, long count);
+--- a/arch/arm64/kernel/arm64ksyms.c
++++ b/arch/arm64/kernel/arm64ksyms.c
+@@ -37,8 +37,8 @@ EXPORT_SYMBOL(clear_page);
+       /* user mem (segment) */
+ EXPORT_SYMBOL(__arch_copy_from_user);
+ EXPORT_SYMBOL(__arch_copy_to_user);
+-EXPORT_SYMBOL(__clear_user);
+-EXPORT_SYMBOL(__copy_in_user);
++EXPORT_SYMBOL(__arch_clear_user);
++EXPORT_SYMBOL(__arch_copy_in_user);
+       /* physical memory */
+ EXPORT_SYMBOL(memstart_addr);
+--- a/arch/arm64/lib/clear_user.S
++++ b/arch/arm64/lib/clear_user.S
+@@ -24,7 +24,7 @@
+       .text
+-/* Prototype: int __clear_user(void *addr, size_t sz)
++/* Prototype: int __arch_clear_user(void *addr, size_t sz)
+  * Purpose  : clear some user memory
+  * Params   : addr - user memory address to clear
+  *          : sz   - number of bytes to clear
+@@ -32,7 +32,7 @@
+  *
+  * Alignment fixed up by hardware.
+  */
+-ENTRY(__clear_user)
++ENTRY(__arch_clear_user)
+ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
+           CONFIG_ARM64_PAN)
+       mov     x2, x1                  // save the size for fixup return
+@@ -57,7 +57,7 @@ uao_user_alternative 9f, strb, sttrb, wz
+ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \
+           CONFIG_ARM64_PAN)
+       ret
+-ENDPROC(__clear_user)
++ENDPROC(__arch_clear_user)
+       .section .fixup,"ax"
+       .align  2
+--- a/arch/arm64/lib/copy_in_user.S
++++ b/arch/arm64/lib/copy_in_user.S
+@@ -67,7 +67,7 @@
+       .endm
+ end   .req    x5
+-ENTRY(__copy_in_user)
++ENTRY(__arch_copy_in_user)
+ ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \
+           CONFIG_ARM64_PAN)
+       add     end, x0, x2
+@@ -76,7 +76,7 @@ ALTERNATIVE("nop", __stringify(SET_PSTAT
+           CONFIG_ARM64_PAN)
+       mov     x0, #0
+       ret
+-ENDPROC(__copy_in_user)
++ENDPROC(__arch_copy_in_user)
+       .section .fixup,"ax"
+       .align  2
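A hedged sketch of the wrapper shape this patch settles on: the checked C entry points mask the __user pointer, so the raw __arch_* assembly routines never see an address above addr_limit. Names follow the hunks above; a kernel build context is assumed.

static inline unsigned long
example_copy_from_user(void *to, const void __user *from, unsigned long n)
{
	kasan_check_write(to, n);		/* destination is kernel memory */
	check_object_size(to, n, false);
	/* only a masked source pointer ever reaches the assembly routine */
	return __arch_copy_from_user(to, __uaccess_mask_ptr(from), n);
}

static inline unsigned long
example_clear_user(void __user *to, unsigned long n)
{
	if (access_ok(VERIFY_WRITE, to, n))
		n = __arch_clear_user(__uaccess_mask_ptr(to), n);
	return n;				/* bytes left un-cleared; 0 on success */
}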
diff --git a/queue-4.9/arm64-uaccess-prevent-speculative-use-of-the-current-addr_limit.patch b/queue-4.9/arm64-uaccess-prevent-speculative-use-of-the-current-addr_limit.patch
new file mode 100644 (file)
index 0000000..9c1d3ef
--- /dev/null
@@ -0,0 +1,49 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:03 +0100
+Subject: [PATCH v4.9.y 07/42] arm64: uaccess: Prevent speculative use of the current addr_limit
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-8-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit c2f0ad4fc089cff81cef6a13d04b399980ecbfcc upstream.
+
+A mispredicted conditional call to set_fs could result in the wrong
+addr_limit being forwarded under speculation to a subsequent access_ok
+check, potentially forming part of a spectre-v1 attack using uaccess
+routines.
+
+This patch prevents this forwarding from taking place by putting heavy
+barriers in set_fs after writing the addr_limit.
+
+Reviewed-by: Mark Rutland <mark.rutland@arm.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/uaccess.h |    7 +++++++
+ 1 file changed, 7 insertions(+)
+
+--- a/arch/arm64/include/asm/uaccess.h
++++ b/arch/arm64/include/asm/uaccess.h
+@@ -68,6 +68,13 @@ static inline void set_fs(mm_segment_t f
+       current_thread_info()->addr_limit = fs;
+       /*
++       * Prevent a mispredicted conditional call to set_fs from forwarding
++       * the wrong address limit to access_ok under speculation.
++       */
++      dsb(nsh);
++      isb();
++
++      /*
+        * Enable/disable UAO so that copy_to_user() etc can access
+        * kernel memory with the unprivileged instructions.
+        */
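The dsb/isb pair above is open-coded below as a hedged, arm64-only sketch of the resulting set_fs(); the kernel's dsb()/isb() macros expand to roughly these asm statements. This only builds with an arm64 toolchain and omits the UAO handling that follows in the real function.

static inline void example_set_fs(mm_segment_t fs)
{
	current_thread_info()->addr_limit = fs;

	/*
	 * Make sure a mispredicted, skipped-over call to set_fs() cannot
	 * forward a stale addr_limit to a later access_ok() check.
	 */
	asm volatile("dsb nsh" : : : "memory");
	asm volatile("isb" : : : "memory");

	/* UAO enable/disable follows here in the real implementation */
}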
diff --git a/queue-4.9/arm64-use-pointer-masking-to-limit-uaccess-speculation.patch b/queue-4.9/arm64-use-pointer-masking-to-limit-uaccess-speculation.patch
new file mode 100644 (file)
index 0000000..b6e3336
--- /dev/null
@@ -0,0 +1,89 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:01 +0100
+Subject: [PATCH v4.9.y 05/42] arm64: Use pointer masking to limit uaccess speculation
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-6-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Robin Murphy <robin.murphy@arm.com>
+
+commit 4d8efc2d5ee4c9ccfeb29ee8afd47a8660d0c0ce upstream.
+
+Similarly to x86, mitigate speculation past an access_ok() check by
+masking the pointer against the address limit before use.
+
+Even if we don't expect speculative writes per se, it is plausible that
+a CPU may still speculate at least as far as fetching a cache line for
+writing, hence we also harden put_user() and clear_user() for peace of
+mind.
+
+Signed-off-by: Robin Murphy <robin.murphy@arm.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/arm64/include/asm/uaccess.h |   26 +++++++++++++++++++++++---
+ 1 file changed, 23 insertions(+), 3 deletions(-)
+
+--- a/arch/arm64/include/asm/uaccess.h
++++ b/arch/arm64/include/asm/uaccess.h
+@@ -129,6 +129,26 @@ static inline unsigned long __range_ok(u
+       "       .popsection\n"
+ /*
++ * Sanitise a uaccess pointer such that it becomes NULL if above the
++ * current addr_limit.
++ */
++#define uaccess_mask_ptr(ptr) (__typeof__(ptr))__uaccess_mask_ptr(ptr)
++static inline void __user *__uaccess_mask_ptr(const void __user *ptr)
++{
++      void __user *safe_ptr;
++
++      asm volatile(
++      "       bics    xzr, %1, %2\n"
++      "       csel    %0, %1, xzr, eq\n"
++      : "=&r" (safe_ptr)
++      : "r" (ptr), "r" (current_thread_info()->addr_limit)
++      : "cc");
++
++      csdb();
++      return safe_ptr;
++}
++
++/*
+  * The "__xxx" versions of the user access functions do not verify the address
+  * space - it must have been done previously with a separate "access_ok()"
+  * call.
+@@ -202,7 +222,7 @@ do {                                                                       \
+       __typeof__(*(ptr)) __user *__p = (ptr);                         \
+       might_fault();                                                  \
+       access_ok(VERIFY_READ, __p, sizeof(*__p)) ?                     \
+-              __get_user((x), __p) :                                  \
++              __p = uaccess_mask_ptr(__p), __get_user((x), __p) :     \
+               ((x) = 0, -EFAULT);                                     \
+ })
+@@ -270,7 +290,7 @@ do {                                                                       \
+       __typeof__(*(ptr)) __user *__p = (ptr);                         \
+       might_fault();                                                  \
+       access_ok(VERIFY_WRITE, __p, sizeof(*__p)) ?                    \
+-              __put_user((x), __p) :                                  \
++              __p = uaccess_mask_ptr(__p), __put_user((x), __p) :     \
+               -EFAULT;                                                \
+ })
+@@ -331,7 +351,7 @@ static inline unsigned long __must_check
+ static inline unsigned long __must_check clear_user(void __user *to, unsigned long n)
+ {
+       if (access_ok(VERIFY_WRITE, to, n))
+-              n = __clear_user(to, n);
++              n = __clear_user(__uaccess_mask_ptr(to), n);
+       return n;
+ }
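For readers without the arm64 instruction reference to hand, here is a hedged, portable C illustration of what the bics/csel sequence computes when addr_limit is an inclusive mask of the form 2^n - 1: a pointer with any bit set above the limit collapses to NULL, everything else passes through unchanged. csdb() is arm64-specific and has no portable equivalent, so it only appears as a comment.

#include <stdint.h>
#include <stdio.h>

/* Illustration only; the kernel uses the inline asm shown in the hunk above. */
static void *mask_user_ptr(void *ptr, uintptr_t addr_limit)
{
	uintptr_t p = (uintptr_t)ptr;
	/* all-ones if ptr has no bits above the limit, all-zeroes otherwise */
	uintptr_t keep = -(uintptr_t)((p & ~addr_limit) == 0);

	/* csdb() would sit here on arm64 to constrain value speculation */
	return (void *)(p & keep);
}

int main(void)
{
	uintptr_t limit = ((uintptr_t)1 << 20) - 1;	/* example inclusive limit */

	printf("%p\n", mask_user_ptr((void *)0x1234, limit));			/* kept */
	printf("%p\n", mask_user_ptr((void *)((uintptr_t)1 << 30), limit));	/* NULL */
	return 0;
}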
diff --git a/queue-4.9/drivers-firmware-expose-psci_get_version-through-psci_ops-structure.patch b/queue-4.9/drivers-firmware-expose-psci_get_version-through-psci_ops-structure.patch
new file mode 100644 (file)
index 0000000..6538a52
--- /dev/null
@@ -0,0 +1,53 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:10 +0100
+Subject: [PATCH v4.9.y 14/42] drivers/firmware: Expose psci_get_version through psci_ops structure
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-15-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit d68e3ba5303f7e1099f51fdcd155f5263da8569b upstream.
+
+Entry into recent versions of ARM Trusted Firmware will invalidate the CPU
+branch predictor state in order to protect against aliasing attacks.
+
+This patch exposes the PSCI "VERSION" function via psci_ops, so that it
+can be invoked outside of the PSCI driver where necessary.
+
+Acked-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/firmware/psci.c |    2 ++
+ include/linux/psci.h    |    1 +
+ 2 files changed, 3 insertions(+)
+
+--- a/drivers/firmware/psci.c
++++ b/drivers/firmware/psci.c
+@@ -496,6 +496,8 @@ static void __init psci_init_migrate(voi
+ static void __init psci_0_2_set_functions(void)
+ {
+       pr_info("Using standard PSCI v0.2 function IDs\n");
++      psci_ops.get_version = psci_get_version;
++
+       psci_function_id[PSCI_FN_CPU_SUSPEND] =
+                                       PSCI_FN_NATIVE(0_2, CPU_SUSPEND);
+       psci_ops.cpu_suspend = psci_cpu_suspend;
+--- a/include/linux/psci.h
++++ b/include/linux/psci.h
+@@ -26,6 +26,7 @@ int psci_cpu_init_idle(unsigned int cpu)
+ int psci_cpu_suspend_enter(unsigned long index);
+ struct psci_operations {
++      u32 (*get_version)(void);
+       int (*cpu_suspend)(u32 state, unsigned long entry_point);
+       int (*cpu_off)(u32 state);
+       int (*cpu_on)(unsigned long cpuid, unsigned long entry_point);
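A hedged usage sketch of the new hook, kernel context assumed. The headers and the NULL check are assumptions on my part (get_version is only populated for PSCI 0.2+ firmware); later patches in this series call it in essentially this way.

#include <linux/printk.h>
#include <linux/psci.h>
#include <uapi/linux/psci.h>

static void example_report_psci_version(void)
{
	u32 ver;

	if (!psci_ops.get_version)	/* PSCI 0.1 or no PSCI at all */
		return;

	ver = psci_ops.get_version();
	pr_info("PSCI v%d.%d detected\n",
		PSCI_VERSION_MAJOR(ver), PSCI_VERSION_MINOR(ver));
}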
diff --git a/queue-4.9/firmware-psci-expose-psci-conduit.patch b/queue-4.9/firmware-psci-expose-psci-conduit.patch
new file mode 100644 (file)
index 0000000..baa52fe
--- /dev/null
@@ -0,0 +1,115 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:33 +0100
+Subject: [PATCH v4.9.y 37/42] firmware/psci: Expose PSCI conduit
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-38-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit 09a8d6d48499f93e2abde691f5800081cd858726 upstream.
+
+In order to call into the firmware to apply workarounds, it is
+useful to find out whether we're using HVC or SMC. Let's expose
+this through the psci_ops.
+
+Acked-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+Reviewed-by: Robin Murphy <robin.murphy@arm.com>
+Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/firmware/psci.c |   28 +++++++++++++++++++++++-----
+ include/linux/psci.h    |    7 +++++++
+ 2 files changed, 30 insertions(+), 5 deletions(-)
+
+--- a/drivers/firmware/psci.c
++++ b/drivers/firmware/psci.c
+@@ -59,7 +59,9 @@ bool psci_tos_resident_on(int cpu)
+       return cpu == resident_cpu;
+ }
+-struct psci_operations psci_ops;
++struct psci_operations psci_ops = {
++      .conduit = PSCI_CONDUIT_NONE,
++};
+ typedef unsigned long (psci_fn)(unsigned long, unsigned long,
+                               unsigned long, unsigned long);
+@@ -210,6 +212,22 @@ static unsigned long psci_migrate_info_u
+                             0, 0, 0);
+ }
++static void set_conduit(enum psci_conduit conduit)
++{
++      switch (conduit) {
++      case PSCI_CONDUIT_HVC:
++              invoke_psci_fn = __invoke_psci_fn_hvc;
++              break;
++      case PSCI_CONDUIT_SMC:
++              invoke_psci_fn = __invoke_psci_fn_smc;
++              break;
++      default:
++              WARN(1, "Unexpected PSCI conduit %d\n", conduit);
++      }
++
++      psci_ops.conduit = conduit;
++}
++
+ static int get_set_conduit_method(struct device_node *np)
+ {
+       const char *method;
+@@ -222,9 +240,9 @@ static int get_set_conduit_method(struct
+       }
+       if (!strcmp("hvc", method)) {
+-              invoke_psci_fn = __invoke_psci_fn_hvc;
++              set_conduit(PSCI_CONDUIT_HVC);
+       } else if (!strcmp("smc", method)) {
+-              invoke_psci_fn = __invoke_psci_fn_smc;
++              set_conduit(PSCI_CONDUIT_SMC);
+       } else {
+               pr_warn("invalid \"method\" property: %s\n", method);
+               return -EINVAL;
+@@ -654,9 +672,9 @@ int __init psci_acpi_init(void)
+       pr_info("probing for conduit method from ACPI.\n");
+       if (acpi_psci_use_hvc())
+-              invoke_psci_fn = __invoke_psci_fn_hvc;
++              set_conduit(PSCI_CONDUIT_HVC);
+       else
+-              invoke_psci_fn = __invoke_psci_fn_smc;
++              set_conduit(PSCI_CONDUIT_SMC);
+       return psci_probe();
+ }
+--- a/include/linux/psci.h
++++ b/include/linux/psci.h
+@@ -25,6 +25,12 @@ bool psci_tos_resident_on(int cpu);
+ int psci_cpu_init_idle(unsigned int cpu);
+ int psci_cpu_suspend_enter(unsigned long index);
++enum psci_conduit {
++      PSCI_CONDUIT_NONE,
++      PSCI_CONDUIT_SMC,
++      PSCI_CONDUIT_HVC,
++};
++
+ struct psci_operations {
+       u32 (*get_version)(void);
+       int (*cpu_suspend)(u32 state, unsigned long entry_point);
+@@ -34,6 +40,7 @@ struct psci_operations {
+       int (*affinity_info)(unsigned long target_affinity,
+                       unsigned long lowest_affinity_level);
+       int (*migrate_info_type)(void);
++      enum psci_conduit conduit;
+ };
+ extern struct psci_operations psci_ops;
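A hedged sketch of how the exposed conduit is consumed by the branch-predictor hardening code added later in this series: the same SMCCC function ID is issued through HVC or SMC depending on how PSCI itself reaches the firmware. ARM_SMCCC_ARCH_WORKAROUND_1 and the arm_smccc_1_1_* helpers come from other patches in this queue; kernel context assumed.

#include <linux/arm-smccc.h>
#include <linux/psci.h>

static void example_invoke_bp_workaround(void)
{
	struct arm_smccc_res res;

	switch (psci_ops.conduit) {
	case PSCI_CONDUIT_HVC:
		arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, &res);
		break;
	case PSCI_CONDUIT_SMC:
		arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, &res);
		break;
	default:
		/* PSCI_CONDUIT_NONE: no firmware channel, nothing to call */
		break;
	}
}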
diff --git a/queue-4.9/firmware-psci-expose-smccc-version-through-psci_ops.patch b/queue-4.9/firmware-psci-expose-smccc-version-through-psci_ops.patch
new file mode 100644 (file)
index 0000000..2042f41
--- /dev/null
@@ -0,0 +1,104 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:34 +0100
+Subject: [PATCH v4.9.y 38/42] firmware/psci: Expose SMCCC version through psci_ops
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-39-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Marc Zyngier <marc.zyngier@arm.com>
+
+commit e78eef554a912ef6c1e0bbf97619dafbeae3339f upstream.
+
+Since PSCI 1.0 allows the SMCCC version to be (indirectly) probed,
+let's do that at boot time, and expose the version of the calling
+convention as part of the psci_ops structure.
+
+Acked-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
+Reviewed-by: Robin Murphy <robin.murphy@arm.com>
+Tested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/firmware/psci.c |   27 +++++++++++++++++++++++++++
+ include/linux/psci.h    |    6 ++++++
+ 2 files changed, 33 insertions(+)
+
+--- a/drivers/firmware/psci.c
++++ b/drivers/firmware/psci.c
+@@ -61,6 +61,7 @@ bool psci_tos_resident_on(int cpu)
+ struct psci_operations psci_ops = {
+       .conduit = PSCI_CONDUIT_NONE,
++      .smccc_version = SMCCC_VERSION_1_0,
+ };
+ typedef unsigned long (psci_fn)(unsigned long, unsigned long,
+@@ -511,6 +512,31 @@ static void __init psci_init_migrate(voi
+       pr_info("Trusted OS resident on physical CPU 0x%lx\n", cpuid);
+ }
++static void __init psci_init_smccc(void)
++{
++      u32 ver = ARM_SMCCC_VERSION_1_0;
++      int feature;
++
++      feature = psci_features(ARM_SMCCC_VERSION_FUNC_ID);
++
++      if (feature != PSCI_RET_NOT_SUPPORTED) {
++              u32 ret;
++              ret = invoke_psci_fn(ARM_SMCCC_VERSION_FUNC_ID, 0, 0, 0);
++              if (ret == ARM_SMCCC_VERSION_1_1) {
++                      psci_ops.smccc_version = SMCCC_VERSION_1_1;
++                      ver = ret;
++              }
++      }
++
++      /*
++       * Conveniently, the SMCCC and PSCI versions are encoded the
++       * same way. No, this isn't accidental.
++       */
++      pr_info("SMC Calling Convention v%d.%d\n",
++              PSCI_VERSION_MAJOR(ver), PSCI_VERSION_MINOR(ver));
++
++}
++
+ static void __init psci_0_2_set_functions(void)
+ {
+       pr_info("Using standard PSCI v0.2 function IDs\n");
+@@ -559,6 +585,7 @@ static int __init psci_probe(void)
+       psci_init_migrate();
+       if (PSCI_VERSION_MAJOR(ver) >= 1) {
++              psci_init_smccc();
+               psci_init_cpu_suspend();
+               psci_init_system_suspend();
+       }
+--- a/include/linux/psci.h
++++ b/include/linux/psci.h
+@@ -31,6 +31,11 @@ enum psci_conduit {
+       PSCI_CONDUIT_HVC,
+ };
++enum smccc_version {
++      SMCCC_VERSION_1_0,
++      SMCCC_VERSION_1_1,
++};
++
+ struct psci_operations {
+       u32 (*get_version)(void);
+       int (*cpu_suspend)(u32 state, unsigned long entry_point);
+@@ -41,6 +46,7 @@ struct psci_operations {
+                       unsigned long lowest_affinity_level);
+       int (*migrate_info_type)(void);
+       enum psci_conduit conduit;
++      enum smccc_version smccc_version;
+ };
+ extern struct psci_operations psci_ops;
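A hedged sketch of why callers care about the new field: only SMCCC 1.1 makes ARM_SMCCC_ARCH_WORKAROUND_1 discoverable, so the mitigation code added later in this queue bails out early on 1.0, roughly as below. Kernel context assumed.

#include <linux/psci.h>
#include <linux/types.h>

static bool example_can_probe_arch_workaround_1(void)
{
	/* SMCCC 1.0 offers no way to discover ARCH_WORKAROUND_1 */
	if (psci_ops.smccc_version < SMCCC_VERSION_1_1)
		return false;

	/* ...an ARM_SMCCC_ARCH_FEATURES probe of the workaround follows... */
	return true;
}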
diff --git a/queue-4.9/mm-introduce-lm_alias.patch b/queue-4.9/mm-introduce-lm_alias.patch
new file mode 100644 (file)
index 0000000..3e91c25
--- /dev/null
@@ -0,0 +1,44 @@
+From foo@baz Tue Apr 17 14:06:43 CEST 2018
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Thu, 12 Apr 2018 12:11:15 +0100
+Subject: [PATCH v4.9.y 19/42] mm: Introduce lm_alias
+To: stable@vger.kernel.org
+Cc: mark.brown@linaro.org, ard.biesheuvel@linaro.org, marc.zyngier@arm.com, will.deacon@arm.com, catalin.marinas@arm.com, ghackmann@google.com, shankerd@codeaurora.org
+Message-ID: <20180412111138.40990-20-mark.rutland@arm.com>
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+
+From: Laura Abbott <labbott@redhat.com>
+
+commit 568c5fe5a54f2654f5a4c599c45b8a62ed9a2013 upstream.
+
+Certain architectures may have the kernel image mapped separately to
+alias the linear map. Introduce a macro lm_alias to translate a kernel
+image symbol into its linear alias. This is used in part with work to
+add CONFIG_DEBUG_VIRTUAL support for arm64.
+
+Reviewed-by: Mark Rutland <mark.rutland@arm.com>
+Tested-by: Mark Rutland <mark.rutland@arm.com>
+Signed-off-by: Laura Abbott <labbott@redhat.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Mark Rutland <mark.rutland@arm.com> [v4.9 backport]
+Tested-by: Greg Hackmann <ghackmann@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/mm.h |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -76,6 +76,10 @@ extern int mmap_rnd_compat_bits __read_m
+ #define page_to_virt(x)       __va(PFN_PHYS(page_to_pfn(x)))
+ #endif
++#ifndef lm_alias
++#define lm_alias(x)   __va(__pa_symbol(x))
++#endif
++
+ /*
+  * To prevent common memory management code establishing
+  * a zero page mapping on a read fault.
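A hedged usage sketch of the new macro, kernel context assumed. _example_symbol is a hypothetical image symbol, used only to show the intent: __pa_symbol() yields the physical address of a symbol in the kernel image mapping, and __va() turns that into the corresponding linear-map address, which is what the arm64 hardening patches in this queue operate on.

#include <linux/mm.h>

extern char _example_symbol[];	/* hypothetical symbol in the kernel image */

static void *example_linear_alias(void)
{
	/* image-map address in, linear-map alias of the same bytes out */
	return lm_alias(_example_symbol);
}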
index ffbd02068553fbbd2b37149a8a94de3a11d54371..8c8f18e72238fe93736d555f29de4094fde063f8 100644 (file)
@@ -7,3 +7,44 @@ perf-intel-pt-fix-error-recovery-from-missing-tip-packet.patch
 perf-intel-pt-fix-timestamp-following-overflow.patch
 perf-core-fix-use-after-free-in-uprobe_perf_close.patch
 radeon-hide-pointless-warning-when-compile-testing.patch
+arm64-barrier-add-csdb-macros-to-control-data-value-prediction.patch
+arm64-implement-array_index_mask_nospec.patch
+arm64-move-task_-definitions-to-asm-processor.h.patch
+arm64-make-user_ds-an-inclusive-limit.patch
+arm64-use-pointer-masking-to-limit-uaccess-speculation.patch
+arm64-entry-ensure-branch-through-syscall-table-is-bounded-under-speculation.patch
+arm64-uaccess-prevent-speculative-use-of-the-current-addr_limit.patch
+arm64-uaccess-don-t-bother-eliding-access_ok-checks-in-__-get-put-_user.patch
+arm64-uaccess-mask-__user-pointers-for-__arch_-clear-copy_-_user.patch
+arm64-cpufeature-__this_cpu_has_cap-shouldn-t-stop-early.patch
+arm64-run-enable-method-for-errata-work-arounds-on-late-cpus.patch
+arm64-cpufeature-pass-capability-structure-to-enable-callback.patch
+drivers-firmware-expose-psci_get_version-through-psci_ops-structure.patch
+arm64-factor-out-ttbr0_el1-post-update-workaround-into-a-specific-asm-macro.patch
+arm64-move-post_ttbr_update_workaround-to-c-code.patch
+arm64-add-skeleton-to-harden-the-branch-predictor-against-aliasing-attacks.patch
+arm64-move-bp-hardening-to-check_and_switch_context.patch
+mm-introduce-lm_alias.patch
+arm64-kvm-use-per-cpu-vector-when-bp-hardening-is-enabled.patch
+arm64-entry-apply-bp-hardening-for-high-priority-synchronous-exceptions.patch
+arm64-entry-apply-bp-hardening-for-suspicious-interrupts-from-el0.patch
+arm64-cputype-add-missing-midr-values-for-cortex-a72-and-cortex-a75.patch
+arm64-cpu_errata-allow-an-erratum-to-be-match-for-all-revisions-of-a-core.patch
+arm64-implement-branch-predictor-hardening-for-affected-cortex-a-cpus.patch
+arm64-branch-predictor-hardening-for-cavium-thunderx2.patch
+arm64-kvm-increment-pc-after-handling-an-smc-trap.patch
+arm-arm64-kvm-consolidate-the-psci-include-files.patch
+arm-arm64-kvm-add-psci_version-helper.patch
+arm-arm64-kvm-add-smccc-accessors-to-psci-code.patch
+arm-arm64-kvm-implement-psci-1.0-support.patch
+arm-arm64-kvm-advertise-smccc-v1.1.patch
+arm64-kvm-make-psci_version-a-fast-path.patch
+arm-arm64-kvm-turn-kvm_psci_version-into-a-static-inline.patch
+arm64-kvm-report-smccc_arch_workaround_1-bp-hardening-support.patch
+arm64-kvm-add-smccc_arch_workaround_1-fast-handling.patch
+firmware-psci-expose-psci-conduit.patch
+firmware-psci-expose-smccc-version-through-psci_ops.patch
+arm-arm64-smccc-make-function-identifiers-an-unsigned-quantity.patch
+arm-arm64-smccc-implement-smccc-v1.1-inline-primitive.patch
+arm64-add-arm_smccc_arch_workaround_1-bp-hardening-support.patch
+arm64-kill-psci_get_version-as-a-variant-2-workaround.patch