git.ipfire.org Git - thirdparty/linux.git/commitdiff
Merge patch series "riscv/barrier: tidying up barrier-related macro"
author     Palmer Dabbelt <palmer@rivosinc.com>
           Wed, 20 Mar 2024 01:52:27 +0000 (18:52 -0700)
committer  Palmer Dabbelt <palmer@rivosinc.com>
           Wed, 20 Mar 2024 15:56:12 +0000 (08:56 -0700)
Eric Chan <ericchancf@google.com> says:

This series makes the barrier-related macros neater and clearer.
It is a follow-up to [0-3], now split into multiple patches for
readability and posted as a new message thread.

[0] (v1/v2) https://lore.kernel.org/lkml/20240209125048.4078639-1-ericchancf@google.com/
[1] (v3)    https://lore.kernel.org/lkml/20240213142856.2416073-1-ericchancf@google.com/
[2] (v4)    https://lore.kernel.org/lkml/20240213200923.2547570-1-ericchancf@google.com/
[3] (v5)    https://lore.kernel.org/lkml/20240213223810.2595804-1-ericchancf@google.com/

* b4-shazam-merge:
  riscv/barrier: Add missing space after ','
  riscv/barrier: Consolidate fence definitions
  riscv/barrier: Define RISCV_FULL_BARRIER
  riscv/barrier: Define __{mb,rmb,wmb}

Link: https://lore.kernel.org/r/20240217131206.3667544-1-ericchancf@google.com
Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
102 files changed:
Documentation/arch/riscv/vm-layout.rst
Documentation/devicetree/bindings/riscv/cpus.yaml
Documentation/devicetree/bindings/riscv/extensions.yaml
Documentation/features/sched/membarrier-sync-core/arch-support.txt
Documentation/scheduler/index.rst
Documentation/scheduler/membarrier.rst [new file with mode: 0644]
MAINTAINERS
arch/riscv/Kbuild
arch/riscv/Kconfig
arch/riscv/Makefile
arch/riscv/boot/dts/renesas/r9a07g043f.dtsi
arch/riscv/configs/defconfig
arch/riscv/crypto/Kconfig [new file with mode: 0644]
arch/riscv/crypto/Makefile [new file with mode: 0644]
arch/riscv/crypto/aes-macros.S [new file with mode: 0644]
arch/riscv/crypto/aes-riscv64-glue.c [new file with mode: 0644]
arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S [new file with mode: 0644]
arch/riscv/crypto/aes-riscv64-zvkned-zvkb.S [new file with mode: 0644]
arch/riscv/crypto/aes-riscv64-zvkned.S [new file with mode: 0644]
arch/riscv/crypto/chacha-riscv64-glue.c [new file with mode: 0644]
arch/riscv/crypto/chacha-riscv64-zvkb.S [new file with mode: 0644]
arch/riscv/crypto/ghash-riscv64-glue.c [new file with mode: 0644]
arch/riscv/crypto/ghash-riscv64-zvkg.S [new file with mode: 0644]
arch/riscv/crypto/sha256-riscv64-glue.c [new file with mode: 0644]
arch/riscv/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S [new file with mode: 0644]
arch/riscv/crypto/sha512-riscv64-glue.c [new file with mode: 0644]
arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S [new file with mode: 0644]
arch/riscv/crypto/sm3-riscv64-glue.c [new file with mode: 0644]
arch/riscv/crypto/sm3-riscv64-zvksh-zvkb.S [new file with mode: 0644]
arch/riscv/crypto/sm4-riscv64-glue.c [new file with mode: 0644]
arch/riscv/crypto/sm4-riscv64-zvksed-zvkb.S [new file with mode: 0644]
arch/riscv/errata/andes/errata.c
arch/riscv/include/asm/asm.h
arch/riscv/include/asm/bitops.h
arch/riscv/include/asm/compat.h
arch/riscv/include/asm/cpufeature.h
arch/riscv/include/asm/elf.h
arch/riscv/include/asm/errata_list.h
arch/riscv/include/asm/hwcap.h
arch/riscv/include/asm/membarrier.h [new file with mode: 0644]
arch/riscv/include/asm/pgalloc.h
arch/riscv/include/asm/pgtable.h
arch/riscv/include/asm/processor.h
arch/riscv/include/asm/simd.h
arch/riscv/include/asm/suspend.h
arch/riscv/include/asm/sync_core.h [new file with mode: 0644]
arch/riscv/include/asm/tlb.h
arch/riscv/include/asm/vector.h
arch/riscv/include/asm/vendorid_list.h
arch/riscv/include/asm/vmalloc.h
arch/riscv/kernel/Makefile
arch/riscv/kernel/alternative.c
arch/riscv/kernel/cpufeature.c
arch/riscv/kernel/entry.S
arch/riscv/kernel/pi/Makefile
arch/riscv/kernel/ptrace.c
arch/riscv/kernel/smpboot.c
arch/riscv/kernel/suspend.c
arch/riscv/kernel/sys_hwprobe.c
arch/riscv/kernel/traps.c
arch/riscv/kernel/traps_misaligned.c
arch/riscv/kernel/unaligned_access_speed.c [new file with mode: 0644]
arch/riscv/lib/csum.c
arch/riscv/lib/uaccess_vector.S
arch/riscv/mm/cacheflush.c
arch/riscv/mm/context.c
arch/riscv/mm/init.c
arch/riscv/mm/pgtable.c
crypto/Kconfig
drivers/acpi/Kconfig
drivers/acpi/riscv/Makefile
drivers/acpi/riscv/cppc.c [new file with mode: 0644]
drivers/acpi/riscv/cpuidle.c [new file with mode: 0644]
drivers/clocksource/timer-clint.c
drivers/clocksource/timer-riscv.c
drivers/cpufreq/Kconfig
drivers/cpufreq/Kconfig.arm
drivers/cpuidle/cpuidle-riscv-sbi.c
drivers/irqchip/irq-riscv-intc.c
drivers/perf/Kconfig
drivers/perf/riscv_pmu_sbi.c
include/asm-generic/bitops/__ffs.h
include/asm-generic/bitops/__fls.h
include/asm-generic/bitops/ffs.h
include/asm-generic/bitops/fls.h
include/linux/mm.h
include/linux/soc/andes/irq.h [new file with mode: 0644]
include/linux/sync_core.h
init/Kconfig
kernel/sched/core.c
kernel/sched/membarrier.c
mm/mmap.c
scripts/Kconfig.include
scripts/Makefile.compiler
tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json [new file with mode: 0644]
tools/perf/pmu-events/arch/riscv/andes/ax45/instructions.json [new file with mode: 0644]
tools/perf/pmu-events/arch/riscv/andes/ax45/memory.json [new file with mode: 0644]
tools/perf/pmu-events/arch/riscv/andes/ax45/microarch.json [new file with mode: 0644]
tools/perf/pmu-events/arch/riscv/mapfile.csv
tools/testing/selftests/riscv/mm/mmap_bottomup.c
tools/testing/selftests/riscv/mm/mmap_default.c
tools/testing/selftests/riscv/mm/mmap_test.h

index 69ff6da1dbf8f3b64898981d811326fdb2391c1d..e476b4386bd9dba3c3f59b501369e169fecc003a 100644 (file)
@@ -144,14 +144,8 @@ passing 0 into the hint address parameter of mmap. On CPUs with an address space
 smaller than sv48, the CPU maximum supported address space will be the default.
 
 Software can "opt-in" to receiving VAs from another VA space by providing
-a hint address to mmap. A hint address passed to mmap will cause the largest
-address space that fits entirely into the hint to be used, unless there is no
-space left in the address space. If there is no space available in the requested
-address space, an address in the next smallest available address space will be
-returned.
-
-For example, in order to obtain 48-bit VA space, a hint address greater than
-:code:`1 << 47` must be provided. Note that this is 47 due to sv48 userspace
-ending at :code:`1 << 47` and the addresses beyond this are reserved for the
-kernel. Similarly, to obtain 57-bit VA space addresses, a hint address greater
-than or equal to :code:`1 << 56` must be provided.
+a hint address to mmap. When a hint address is passed to mmap, the returned
+address will never use more bits than the hint address. For example, if a hint
+address of `1 << 40` is passed to mmap, a valid returned address will never use
+bits 41 through 63. If no mappable addresses are available in that range, mmap
+will return `MAP_FAILED`.
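
A minimal userspace sketch of the hint behavior documented above (the 1 << 40
hint mirrors the example in the text; error handling is the caller's choice):

    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
            /*
             * Hint an address that fits in 40 bits.  Per the documentation
             * above, a valid returned address will never use bits 41-63,
             * and mmap returns MAP_FAILED if that range is exhausted.
             */
            void *hint = (void *)(1UL << 40);
            void *p = mmap(hint, 4096, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }
            printf("mapped at %p\n", p);
            munmap(p, 4096);
            return 0;
    }
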
index 9d8670c00e3b3bdea5d2196b98538d97465abf0d..6ccd75cbbc59d17579a44420ffd96ecdef8f69fe 100644 (file)
@@ -106,7 +106,11 @@ properties:
         const: 1
 
       compatible:
-        const: riscv,cpu-intc
+        oneOf:
+          - items:
+              - const: andestech,cpu-intc
+              - const: riscv,cpu-intc
+          - const: riscv,cpu-intc
 
       interrupt-controller: true
 
index 63d81dc895e5ce4c08715ce1d6bf0958a757ca86..468c646247aa5cebbea5cbe839c01cfacbaecf7e 100644 (file)
@@ -477,5 +477,12 @@ properties:
             latency, as ratified in commit 56ed795 ("Update
             riscv-crypto-spec-vector.adoc") of riscv-crypto.
 
+        - const: xandespmu
+          description:
+            The Andes Technology performance monitor extension for counter overflow
+            and privilege mode filtering. For more details, see Counter Related
+            Registers in the AX45MP datasheet.
+            https://www.andestech.com/wp-content/uploads/AX45MP-1C-Rev.-5.0.0-Datasheet.pdf
+
 additionalProperties: true
 ...
index d96b778b87ed8e8c19b457340a3386ef186f19bc..7425d2b994a3997447a42a43bf924c7be401a6d7 100644 (file)
 # Rely on implicit context synchronization as a result of exception return
 # when returning from IPI handler, and when returning to user-space.
 #
+# * riscv
+#
+# riscv uses xRET as return from interrupt and to return to user-space.
+#
+# Given that xRET is not core serializing, we rely on FENCE.I for providing
+# core serialization:
+#
+#  - by calling sync_core_before_usermode() on return from interrupt (cf.
+#    ipi_sync_core()),
+#
+#  - via switch_mm() and sync_core_before_usermode() (respectively, for
+#    uthread->uthread and kthread->uthread transitions) before returning
+#    to user-space.
+#
+#  The serialization in switch_mm() is activated by prepare_sync_core_cmd().
+#
 # * x86
 #
 # x86-32 uses IRET as return from interrupt, which takes care of the IPI.
@@ -43,7 +59,7 @@
     |    openrisc: | TODO |
     |      parisc: | TODO |
     |     powerpc: |  ok  |
-    |       riscv: | TODO |
+    |       riscv: |  ok  |
     |        s390: |  ok  |
     |          sh: | TODO |
     |       sparc: | TODO |
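
The riscv entry above relies on FENCE.I being issued via sync_core_before_usermode(),
since xRET is not core serializing.  The new arch/riscv/include/asm/sync_core.h is not
shown in this diff; a sketch of its assumed shape, based only on the description above:

    /* Assumed shape of arch/riscv/include/asm/sync_core.h (not shown in this diff). */
    #ifndef _ASM_RISCV_SYNC_CORE_H
    #define _ASM_RISCV_SYNC_CORE_H

    /*
     * RISC-V returns to user space via xRET, which is not core serializing,
     * so a FENCE.I is needed before returning to user mode.
     */
    static inline void sync_core_before_usermode(void)
    {
            asm volatile ("fence.i" ::: "memory");
    }

    #endif /* _ASM_RISCV_SYNC_CORE_H */
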
index 3170747226f6da7f828223f6504704391fe47314..43bd8a145b7a9b380e4706b751e40ab7fe05138a 100644 (file)
@@ -7,6 +7,7 @@ Scheduler
 
 
     completion
+    membarrier
     sched-arch
     sched-bwc
     sched-deadline
diff --git a/Documentation/scheduler/membarrier.rst b/Documentation/scheduler/membarrier.rst
new file mode 100644 (file)
index 0000000..2387804
--- /dev/null
@@ -0,0 +1,39 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========================
+membarrier() System Call
+========================
+
+MEMBARRIER_CMD_{PRIVATE,GLOBAL}_EXPEDITED - Architecture requirements
+=====================================================================
+
+Memory barriers before updating rq->curr
+----------------------------------------
+
+The commands MEMBARRIER_CMD_PRIVATE_EXPEDITED and MEMBARRIER_CMD_GLOBAL_EXPEDITED
+require each architecture to have a full memory barrier after coming from
+user-space, before updating rq->curr.  This barrier is implied by the sequence
+rq_lock(); smp_mb__after_spinlock() in __schedule().  The barrier matches a full
+barrier in the proximity of the membarrier system call exit, cf.
+membarrier_{private,global}_expedited().
+
+Memory barriers after updating rq->curr
+---------------------------------------
+
+The commands MEMBARRIER_CMD_PRIVATE_EXPEDITED and MEMBARRIER_CMD_GLOBAL_EXPEDITED
+require each architecture to have a full memory barrier after updating rq->curr,
+before returning to user-space.  The schemes providing this barrier on the various
+architectures are as follows.
+
+ - alpha, arc, arm, hexagon, mips rely on the full barrier implied by
+   spin_unlock() in finish_lock_switch().
+
+ - arm64 relies on the full barrier implied by switch_to().
+
+ - powerpc, riscv, s390, sparc, x86 rely on the full barrier implied by
+   switch_mm(), if mm is not NULL; they rely on the full barrier implied
+   by mmdrop(), otherwise.  On powerpc and riscv, switch_mm() relies on
+   membarrier_arch_switch_mm().
+
+The barrier matches a full barrier in the proximity of the membarrier system call
+entry, cf. membarrier_{private,global}_expedited().
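
For reference, user space issues the expedited commands documented above roughly as in
the hedged sketch below; the private expedited command must be registered once per
process before it is used:

    #include <linux/membarrier.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int membarrier(int cmd, unsigned int flags, int cpu_id)
    {
            return syscall(__NR_membarrier, cmd, flags, cpu_id);
    }

    int main(void)
    {
            /* Register once, then force a full barrier on every other
             * running thread of this process. */
            if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0))
                    return 1;
            if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0, 0))
                    return 1;
            return 0;
    }
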
index 8d1052fa6a6924d17a4d2681fa7907c544e35186..cc80968ec355d15185cdd5ff43c253ddbe3e400a 100644 (file)
@@ -14039,7 +14039,9 @@ M:      Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 M:     "Paul E. McKenney" <paulmck@kernel.org>
 L:     linux-kernel@vger.kernel.org
 S:     Supported
-F:     arch/powerpc/include/asm/membarrier.h
+F:     Documentation/scheduler/membarrier.rst
+F:     arch/*/include/asm/membarrier.h
+F:     arch/*/include/asm/sync_core.h
 F:     include/uapi/linux/membarrier.h
 F:     kernel/sched/membarrier.c
 
index d25ad1c19f881d0ed299a191908af885b1baa91b..2c585f7a0b6ef325e0954d88e64a3c2310af3740 100644 (file)
@@ -2,6 +2,7 @@
 
 obj-y += kernel/ mm/ net/
 obj-$(CONFIG_BUILTIN_DTB) += boot/dts/
+obj-$(CONFIG_CRYPTO) += crypto/
 obj-y += errata/
 obj-$(CONFIG_KVM) += kvm/
 
index bffbd869a0682842883591788da784648acf1626..8ebafe337eac9880970ff877065c3f7161375313 100644 (file)
@@ -27,14 +27,18 @@ config RISCV
        select ARCH_HAS_GCOV_PROFILE_ALL
        select ARCH_HAS_GIGANTIC_PAGE
        select ARCH_HAS_KCOV
+       select ARCH_HAS_MEMBARRIER_CALLBACKS
+       select ARCH_HAS_MEMBARRIER_SYNC_CORE
        select ARCH_HAS_MMIOWB
        select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
        select ARCH_HAS_PMEM_API
+       select ARCH_HAS_PREPARE_SYNC_CORE_CMD
        select ARCH_HAS_PTE_SPECIAL
        select ARCH_HAS_SET_DIRECT_MAP if MMU
        select ARCH_HAS_SET_MEMORY if MMU
        select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL
        select ARCH_HAS_STRICT_MODULE_RWX if MMU && !XIP_KERNEL
+       select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
        select ARCH_HAS_SYSCALL_WRAPPER
        select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
        select ARCH_HAS_UBSAN_SANITIZE_ALL
@@ -47,6 +51,9 @@ config RISCV
        select ARCH_SUPPORTS_CFI_CLANG
        select ARCH_SUPPORTS_DEBUG_PAGEALLOC if MMU
        select ARCH_SUPPORTS_HUGETLBFS if MMU
+       # LLD >= 14: https://github.com/llvm/llvm-project/issues/50505
+       select ARCH_SUPPORTS_LTO_CLANG if LLD_VERSION >= 140000
+       select ARCH_SUPPORTS_LTO_CLANG_THIN if LLD_VERSION >= 140000
        select ARCH_SUPPORTS_PAGE_TABLE_CHECK if MMU
        select ARCH_SUPPORTS_PER_VMA_LOCK if MMU
        select ARCH_SUPPORTS_SHADOW_CALL_STACK if HAVE_SHADOW_CALL_STACK
@@ -106,6 +113,7 @@ config RISCV
        select HAVE_ARCH_KGDB_QXFER_PKT
        select HAVE_ARCH_MMAP_RND_BITS if MMU
        select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
+       select HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET
        select HAVE_ARCH_SECCOMP_FILTER
        select HAVE_ARCH_THREAD_STRUCT_WHITELIST
        select HAVE_ARCH_TRACEHOOK
@@ -124,6 +132,7 @@ config RISCV
        select HAVE_FUNCTION_GRAPH_RETVAL if HAVE_FUNCTION_GRAPH_TRACER
        select HAVE_FUNCTION_TRACER if !XIP_KERNEL && !PREEMPTION
        select HAVE_EBPF_JIT if MMU
+       select HAVE_FAST_GUP if MMU
        select HAVE_FUNCTION_ARG_ACCESS_API
        select HAVE_FUNCTION_ERROR_INJECTION
        select HAVE_GCC_PLUGINS
@@ -154,6 +163,7 @@ config RISCV
        select IRQ_FORCED_THREADING
        select KASAN_VMALLOC if KASAN
        select LOCK_MM_AND_FIND_VMA
+       select MMU_GATHER_RCU_TABLE_FREE if SMP && MMU
        select MODULES_USE_ELF_RELA if MODULES
        select MODULE_SECTIONS if MODULES
        select OF
@@ -315,7 +325,6 @@ config AS_HAS_OPTION_ARCH
        # https://reviews.llvm.org/D123515
        def_bool y
        depends on $(as-instr, .option arch$(comma) +m)
-       depends on !$(as-instr, .option arch$(comma) -i)
 
 source "arch/riscv/Kconfig.socs"
 source "arch/riscv/Kconfig.errata"
@@ -578,6 +587,13 @@ config TOOLCHAIN_HAS_ZBB
        depends on LLD_VERSION >= 150000 || LD_VERSION >= 23900
        depends on AS_HAS_OPTION_ARCH
 
+# This symbol indicates that the toolchain supports all v1.0 vector crypto
+# extensions, including Zvk*, Zvbb, and Zvbc.  LLVM added all of these at once.
+# binutils added all except Zvkb, then added Zvkb.  So we just check for Zvkb.
+config TOOLCHAIN_HAS_VECTOR_CRYPTO
+       def_bool $(as-instr, .option arch$(comma) +v$(comma) +zvkb)
+       depends on AS_HAS_OPTION_ARCH
+
 config RISCV_ISA_ZBB
        bool "Zbb extension support for bit manipulation instructions"
        depends on TOOLCHAIN_HAS_ZBB
@@ -688,27 +704,61 @@ config THREAD_SIZE_ORDER
          affects irq stack size, which is equal to thread stack size.
 
 config RISCV_MISALIGNED
-       bool "Support misaligned load/store traps for kernel and userspace"
+       bool
        select SYSCTL_ARCH_UNALIGN_ALLOW
-       default y
        help
-         Say Y here if you want the kernel to embed support for misaligned
-         load/store for both kernel and userspace. When disable, misaligned
-         accesses will generate SIGBUS in userspace and panic in kernel.
+         Embed support for emulating misaligned loads and stores.
+
+choice
+       prompt "Unaligned Accesses Support"
+       default RISCV_PROBE_UNALIGNED_ACCESS
+       help
+         This determines the level of support for unaligned accesses. This
+         information is used by the kernel to perform optimizations. It is also
+         exposed to user space via the hwprobe syscall. The hardware will be
+         probed at boot by default.
+
+config RISCV_PROBE_UNALIGNED_ACCESS
+       bool "Probe for hardware unaligned access support"
+       select RISCV_MISALIGNED
+       help
+         During boot, the kernel will run a series of tests to determine the
+         speed of unaligned accesses. This probing will dynamically determine
+         the speed of unaligned accesses on the underlying system. If unaligned
+         memory accesses trap into the kernel as they are not supported by the
+         system, the kernel will emulate the unaligned accesses to preserve the
+         UABI.
+
+config RISCV_EMULATED_UNALIGNED_ACCESS
+       bool "Emulate unaligned access where system support is missing"
+       select RISCV_MISALIGNED
+       help
+         If unaligned memory accesses trap into the kernel as they are not
+         supported by the system, the kernel will emulate the unaligned
+         accesses to preserve the UABI. When the underlying system does support
+         unaligned accesses, the unaligned accesses are assumed to be slow.
+
+config RISCV_SLOW_UNALIGNED_ACCESS
+       bool "Assume the system supports slow unaligned memory accesses"
+       depends on NONPORTABLE
+       help
+         Assume that the system supports slow unaligned memory accesses. The
+         kernel and userspace programs may not be able to run at all on systems
+         that do not support unaligned memory accesses.
 
 config RISCV_EFFICIENT_UNALIGNED_ACCESS
-       bool "Assume the CPU supports fast unaligned memory accesses"
+       bool "Assume the system supports fast unaligned memory accesses"
        depends on NONPORTABLE
        select DCACHE_WORD_ACCESS if MMU
        select HAVE_EFFICIENT_UNALIGNED_ACCESS
        help
-         Say Y here if you want the kernel to assume that the CPU supports
-         efficient unaligned memory accesses.  When enabled, this option
-         improves the performance of the kernel on such CPUs.  However, the
-         kernel will run much more slowly, or will not be able to run at all,
-         on CPUs that do not support efficient unaligned memory accesses.
+         Assume that the system supports fast unaligned memory accesses. When
+         enabled, this option improves the performance of the kernel on such
+         systems. However, the kernel and userspace programs will run much more
+         slowly, or will not be able to run at all, on systems that do not
+         support efficient unaligned memory accesses.
 
-         If unsure what to do here, say N.
+endchoice
 
 endmenu # "Platform type"
 
@@ -1001,11 +1051,8 @@ menu "Power management options"
 
 source "kernel/power/Kconfig"
 
-# Hibernation is only possible on systems where the SBI implementation has
-# marked its reserved memory as not accessible from, or does not run
-# from the same memory as, Linux
 config ARCH_HIBERNATION_POSSIBLE
-       def_bool NONPORTABLE
+       def_bool y
 
 config ARCH_HIBERNATION_HEADER
        def_bool HIBERNATION
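
The unaligned-access choice added above notes that the probed speed is exposed to user
space through the hwprobe syscall.  A hedged sketch of such a query (key and value
names are taken from the riscv hwprobe UAPI as understood here; verify against
asm/hwprobe.h on the target system):

    #include <asm/hwprobe.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_CPUPERF_0 };

            /* Query all online CPUs: cpusetsize = 0, cpus = NULL, flags = 0. */
            if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0))
                    return 1;

            if ((pair.value & RISCV_HWPROBE_MISALIGNED_MASK) ==
                RISCV_HWPROBE_MISALIGNED_FAST)
                    printf("unaligned accesses are fast\n");
            else
                    printf("unaligned accesses are slow or emulated\n");
            return 0;
    }
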
index 0b7d109258e7d850846bb3c5f084a0482f07d02b..252d63942f34ebe08a3087d12bee3a1c4833f15a 100644 (file)
@@ -50,6 +50,11 @@ ifndef CONFIG_AS_IS_LLVM
        KBUILD_CFLAGS += -Wa,-mno-relax
        KBUILD_AFLAGS += -Wa,-mno-relax
 endif
+# LLVM has an issue with target-features and LTO: https://github.com/llvm/llvm-project/issues/59350
+# Ensure it is aware of linker relaxation with LTO, otherwise relocations may
+# be incorrect: https://github.com/llvm/llvm-project/issues/65090
+else ifeq ($(CONFIG_LTO_CLANG),y)
+       KBUILD_LDFLAGS += -mllvm -mattr=+c -mllvm -mattr=+relax
 endif
 
 ifeq ($(CONFIG_SHADOW_CALL_STACK),y)
index a92cfcfc021b4c3847a48828a45948da169c882f..d7a66043f13b95eed4d436354df284ac9975099c 100644 (file)
@@ -27,7 +27,7 @@
                        riscv,isa-base = "rv64i";
                        riscv,isa-extensions = "i", "m", "a", "f", "d", "c",
                                               "zicntr", "zicsr", "zifencei",
-                                              "zihpm";
+                                              "zihpm", "xandespmu";
                        mmu-type = "riscv,sv39";
                        i-cache-size = <0x8000>;
                        i-cache-line-size = <0x40>;
@@ -39,7 +39,7 @@
 
                        cpu0_intc: interrupt-controller {
                                #interrupt-cells = <1>;
-                               compatible = "riscv,cpu-intc";
+                               compatible = "andestech,cpu-intc", "riscv,cpu-intc";
                                interrupt-controller;
                        };
                };
index eaf34e871e308f0db7a0a578b34940d8d551b163..fc0ec2ee13bc22c842e7ce4025e92e05599b5e1c 100644 (file)
@@ -44,6 +44,7 @@ CONFIG_CPU_FREQ_GOV_USERSPACE=y
 CONFIG_CPU_FREQ_GOV_ONDEMAND=y
 CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m
 CONFIG_CPUFREQ_DT=y
+CONFIG_ACPI_CPPC_CPUFREQ=m
 CONFIG_VIRTUALIZATION=y
 CONFIG_KVM=m
 CONFIG_ACPI=y
@@ -215,6 +216,7 @@ CONFIG_MMC=y
 CONFIG_MMC_SDHCI=y
 CONFIG_MMC_SDHCI_PLTFM=y
 CONFIG_MMC_SDHCI_CADENCE=y
+CONFIG_MMC_SDHCI_OF_DWCMSHC=y
 CONFIG_MMC_SPI=y
 CONFIG_MMC_DW=y
 CONFIG_MMC_DW_STARFIVE=y
@@ -224,6 +226,7 @@ CONFIG_RTC_CLASS=y
 CONFIG_RTC_DRV_SUN6I=y
 CONFIG_DMADEVICES=y
 CONFIG_DMA_SUN6I=m
+CONFIG_DW_AXI_DMAC=y
 CONFIG_RZ_DMAC=y
 CONFIG_VIRTIO_PCI=y
 CONFIG_VIRTIO_BALLOON=y
diff --git a/arch/riscv/crypto/Kconfig b/arch/riscv/crypto/Kconfig
new file mode 100644 (file)
index 0000000..ad58dad
--- /dev/null
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: GPL-2.0
+
+menu "Accelerated Cryptographic Algorithms for CPU (riscv)"
+
+config CRYPTO_AES_RISCV64
+       tristate "Ciphers: AES, modes: ECB, CBC, CTS, CTR, XTS"
+       depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+       select CRYPTO_ALGAPI
+       select CRYPTO_LIB_AES
+       select CRYPTO_SKCIPHER
+       help
+         Block cipher: AES cipher algorithms
+         Length-preserving ciphers: AES with ECB, CBC, CTS, CTR, XTS
+
+         Architecture: riscv64 using:
+         - Zvkned vector crypto extension
+         - Zvbb vector extension (XTS)
+         - Zvkb vector crypto extension (CTR)
+         - Zvkg vector crypto extension (XTS)
+
+config CRYPTO_CHACHA_RISCV64
+       tristate "Ciphers: ChaCha"
+       depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+       select CRYPTO_SKCIPHER
+       select CRYPTO_LIB_CHACHA_GENERIC
+       help
+         Length-preserving ciphers: ChaCha20 stream cipher algorithm
+
+         Architecture: riscv64 using:
+         - Zvkb vector crypto extension
+
+config CRYPTO_GHASH_RISCV64
+       tristate "Hash functions: GHASH"
+       depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+       select CRYPTO_GCM
+       help
+         GCM GHASH function (NIST SP 800-38D)
+
+         Architecture: riscv64 using:
+         - Zvkg vector crypto extension
+
+config CRYPTO_SHA256_RISCV64
+       tristate "Hash functions: SHA-224 and SHA-256"
+       depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+       select CRYPTO_SHA256
+       help
+         SHA-224 and SHA-256 secure hash algorithm (FIPS 180)
+
+         Architecture: riscv64 using:
+         - Zvknha or Zvknhb vector crypto extensions
+         - Zvkb vector crypto extension
+
+config CRYPTO_SHA512_RISCV64
+       tristate "Hash functions: SHA-384 and SHA-512"
+       depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+       select CRYPTO_SHA512
+       help
+         SHA-384 and SHA-512 secure hash algorithm (FIPS 180)
+
+         Architecture: riscv64 using:
+         - Zvknhb vector crypto extension
+         - Zvkb vector crypto extension
+
+config CRYPTO_SM3_RISCV64
+       tristate "Hash functions: SM3 (ShangMi 3)"
+       depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+       select CRYPTO_HASH
+       select CRYPTO_SM3
+       help
+         SM3 (ShangMi 3) secure hash function (OSCCA GM/T 0004-2012)
+
+         Architecture: riscv64 using:
+         - Zvksh vector crypto extension
+         - Zvkb vector crypto extension
+
+config CRYPTO_SM4_RISCV64
+       tristate "Ciphers: SM4 (ShangMi 4)"
+       depends on 64BIT && RISCV_ISA_V && TOOLCHAIN_HAS_VECTOR_CRYPTO
+       select CRYPTO_ALGAPI
+       select CRYPTO_SM4
+       help
+         SM4 block cipher algorithm (OSCCA GB/T 32907-2016,
+         ISO/IEC 18033-3:2010/Amd 1:2021)
+
+         SM4 (GBT.32907-2016) is a cryptographic standard issued by the
+         Organization of State Commercial Administration of China (OSCCA)
+         as an authorized cryptographic algorithm for use within China.
+
+         Architecture: riscv64 using:
+         - Zvksed vector crypto extension
+         - Zvkb vector crypto extension
+
+endmenu
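
These drivers register with the kernel crypto API under the usual algorithm names
("sha256", "xts(aes)", and so on) and are selected automatically when their priority
is higher than that of the generic implementations.  A hedged sketch of kernel code
requesting SHA-256 through the shash interface (nothing here is specific to the
riscv64 drivers; sha256_digest_example is an illustrative helper name):

    #include <crypto/hash.h>
    #include <linux/err.h>

    static int sha256_digest_example(const u8 *data, unsigned int len, u8 out[32])
    {
            struct crypto_shash *tfm;
            int err;

            /* The API picks the highest-priority "sha256" implementation. */
            tfm = crypto_alloc_shash("sha256", 0, 0);
            if (IS_ERR(tfm))
                    return PTR_ERR(tfm);

            {
                    SHASH_DESC_ON_STACK(desc, tfm);

                    desc->tfm = tfm;
                    err = crypto_shash_digest(desc, data, len, out);
            }

            crypto_free_shash(tfm);
            return err;
    }
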
diff --git a/arch/riscv/crypto/Makefile b/arch/riscv/crypto/Makefile
new file mode 100644 (file)
index 0000000..247c7bc
--- /dev/null
@@ -0,0 +1,23 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_CRYPTO_AES_RISCV64) += aes-riscv64.o
+aes-riscv64-y := aes-riscv64-glue.o aes-riscv64-zvkned.o \
+                aes-riscv64-zvkned-zvbb-zvkg.o aes-riscv64-zvkned-zvkb.o
+
+obj-$(CONFIG_CRYPTO_CHACHA_RISCV64) += chacha-riscv64.o
+chacha-riscv64-y := chacha-riscv64-glue.o chacha-riscv64-zvkb.o
+
+obj-$(CONFIG_CRYPTO_GHASH_RISCV64) += ghash-riscv64.o
+ghash-riscv64-y := ghash-riscv64-glue.o ghash-riscv64-zvkg.o
+
+obj-$(CONFIG_CRYPTO_SHA256_RISCV64) += sha256-riscv64.o
+sha256-riscv64-y := sha256-riscv64-glue.o sha256-riscv64-zvknha_or_zvknhb-zvkb.o
+
+obj-$(CONFIG_CRYPTO_SHA512_RISCV64) += sha512-riscv64.o
+sha512-riscv64-y := sha512-riscv64-glue.o sha512-riscv64-zvknhb-zvkb.o
+
+obj-$(CONFIG_CRYPTO_SM3_RISCV64) += sm3-riscv64.o
+sm3-riscv64-y := sm3-riscv64-glue.o sm3-riscv64-zvksh-zvkb.o
+
+obj-$(CONFIG_CRYPTO_SM4_RISCV64) += sm4-riscv64.o
+sm4-riscv64-y := sm4-riscv64-glue.o sm4-riscv64-zvksed-zvkb.o
diff --git a/arch/riscv/crypto/aes-macros.S b/arch/riscv/crypto/aes-macros.S
new file mode 100644 (file)
index 0000000..d1a258d
--- /dev/null
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// This file is dual-licensed, meaning that you can use it under your
+// choice of either of the following two licenses:
+//
+// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You can obtain
+// a copy in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// or
+//
+// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
+// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
+// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
+// Copyright 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This file contains macros that are shared by the other aes-*.S files.  The
+// generated code of these macros depends on the following RISC-V extensions:
+// - RV64I
+// - RISC-V Vector ('V') with VLEN >= 128
+// - RISC-V Vector AES block cipher extension ('Zvkned')
+
+// Loads the AES round keys from \keyp into vector registers and jumps to code
+// specific to the length of the key.  Specifically:
+//   - If AES-128, loads round keys into v1-v11 and jumps to \label128.
+//   - If AES-192, loads round keys into v1-v13 and jumps to \label192.
+//   - If AES-256, loads round keys into v1-v15 and continues onwards.
+//
+// Also sets vl=4 and vtype=e32,m1,ta,ma.  Clobbers t0 and t1.
+.macro aes_begin       keyp, label128, label192
+       lwu             t0, 480(\keyp)  // t0 = key length in bytes
+       li              t1, 24          // t1 = key length for AES-192
+       vsetivli        zero, 4, e32, m1, ta, ma
+       vle32.v         v1, (\keyp)
+       addi            \keyp, \keyp, 16
+       vle32.v         v2, (\keyp)
+       addi            \keyp, \keyp, 16
+       vle32.v         v3, (\keyp)
+       addi            \keyp, \keyp, 16
+       vle32.v         v4, (\keyp)
+       addi            \keyp, \keyp, 16
+       vle32.v         v5, (\keyp)
+       addi            \keyp, \keyp, 16
+       vle32.v         v6, (\keyp)
+       addi            \keyp, \keyp, 16
+       vle32.v         v7, (\keyp)
+       addi            \keyp, \keyp, 16
+       vle32.v         v8, (\keyp)
+       addi            \keyp, \keyp, 16
+       vle32.v         v9, (\keyp)
+       addi            \keyp, \keyp, 16
+       vle32.v         v10, (\keyp)
+       addi            \keyp, \keyp, 16
+       vle32.v         v11, (\keyp)
+       blt             t0, t1, \label128       // If AES-128, goto label128.
+       addi            \keyp, \keyp, 16
+       vle32.v         v12, (\keyp)
+       addi            \keyp, \keyp, 16
+       vle32.v         v13, (\keyp)
+       beq             t0, t1, \label192       // If AES-192, goto label192.
+       // Else, it's AES-256.
+       addi            \keyp, \keyp, 16
+       vle32.v         v14, (\keyp)
+       addi            \keyp, \keyp, 16
+       vle32.v         v15, (\keyp)
+.endm
+
+// Encrypts \data using zvkned instructions, using the round keys loaded into
+// v1-v11 (for AES-128), v1-v13 (for AES-192), or v1-v15 (for AES-256).  \keylen
+// is the AES key length in bits.  vl and vtype must already be set
+// appropriately.  Note that if vl > 4, multiple blocks are encrypted.
+.macro aes_encrypt     data, keylen
+       vaesz.vs        \data, v1
+       vaesem.vs       \data, v2
+       vaesem.vs       \data, v3
+       vaesem.vs       \data, v4
+       vaesem.vs       \data, v5
+       vaesem.vs       \data, v6
+       vaesem.vs       \data, v7
+       vaesem.vs       \data, v8
+       vaesem.vs       \data, v9
+       vaesem.vs       \data, v10
+.if \keylen == 128
+       vaesef.vs       \data, v11
+.elseif \keylen == 192
+       vaesem.vs       \data, v11
+       vaesem.vs       \data, v12
+       vaesef.vs       \data, v13
+.else
+       vaesem.vs       \data, v11
+       vaesem.vs       \data, v12
+       vaesem.vs       \data, v13
+       vaesem.vs       \data, v14
+       vaesef.vs       \data, v15
+.endif
+.endm
+
+// Same as aes_encrypt, but decrypts instead of encrypts.
+.macro aes_decrypt     data, keylen
+.if \keylen == 128
+       vaesz.vs        \data, v11
+.elseif \keylen == 192
+       vaesz.vs        \data, v13
+       vaesdm.vs       \data, v12
+       vaesdm.vs       \data, v11
+.else
+       vaesz.vs        \data, v15
+       vaesdm.vs       \data, v14
+       vaesdm.vs       \data, v13
+       vaesdm.vs       \data, v12
+       vaesdm.vs       \data, v11
+.endif
+       vaesdm.vs       \data, v10
+       vaesdm.vs       \data, v9
+       vaesdm.vs       \data, v8
+       vaesdm.vs       \data, v7
+       vaesdm.vs       \data, v6
+       vaesdm.vs       \data, v5
+       vaesdm.vs       \data, v4
+       vaesdm.vs       \data, v3
+       vaesdm.vs       \data, v2
+       vaesdf.vs       \data, v1
+.endm
+
+// Expands to aes_encrypt or aes_decrypt according to \enc, which is 1 or 0.
+.macro aes_crypt       data, enc, keylen
+.if \enc
+       aes_encrypt     \data, \keylen
+.else
+       aes_decrypt     \data, \keylen
+.endif
+.endm
diff --git a/arch/riscv/crypto/aes-riscv64-glue.c b/arch/riscv/crypto/aes-riscv64-glue.c
new file mode 100644 (file)
index 0000000..f814ee0
--- /dev/null
@@ -0,0 +1,637 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AES using the RISC-V vector crypto extensions.  Includes the bare block
+ * cipher and the ECB, CBC, CBC-CTS, CTR, and XTS modes.
+ *
+ * Copyright (C) 2023 VRULL GmbH
+ * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
+ *
+ * Copyright (C) 2023 SiFive, Inc.
+ * Author: Jerry Shih <jerry.shih@sifive.com>
+ *
+ * Copyright 2024 Google LLC
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+#include <crypto/aes.h>
+#include <crypto/internal/cipher.h>
+#include <crypto/internal/simd.h>
+#include <crypto/internal/skcipher.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/xts.h>
+#include <linux/linkage.h>
+#include <linux/module.h>
+
+asmlinkage void aes_encrypt_zvkned(const struct crypto_aes_ctx *key,
+                                  const u8 in[AES_BLOCK_SIZE],
+                                  u8 out[AES_BLOCK_SIZE]);
+asmlinkage void aes_decrypt_zvkned(const struct crypto_aes_ctx *key,
+                                  const u8 in[AES_BLOCK_SIZE],
+                                  u8 out[AES_BLOCK_SIZE]);
+
+asmlinkage void aes_ecb_encrypt_zvkned(const struct crypto_aes_ctx *key,
+                                      const u8 *in, u8 *out, size_t len);
+asmlinkage void aes_ecb_decrypt_zvkned(const struct crypto_aes_ctx *key,
+                                      const u8 *in, u8 *out, size_t len);
+
+asmlinkage void aes_cbc_encrypt_zvkned(const struct crypto_aes_ctx *key,
+                                      const u8 *in, u8 *out, size_t len,
+                                      u8 iv[AES_BLOCK_SIZE]);
+asmlinkage void aes_cbc_decrypt_zvkned(const struct crypto_aes_ctx *key,
+                                      const u8 *in, u8 *out, size_t len,
+                                      u8 iv[AES_BLOCK_SIZE]);
+
+asmlinkage void aes_cbc_cts_crypt_zvkned(const struct crypto_aes_ctx *key,
+                                        const u8 *in, u8 *out, size_t len,
+                                        const u8 iv[AES_BLOCK_SIZE], bool enc);
+
+asmlinkage void aes_ctr32_crypt_zvkned_zvkb(const struct crypto_aes_ctx *key,
+                                           const u8 *in, u8 *out, size_t len,
+                                           u8 iv[AES_BLOCK_SIZE]);
+
+asmlinkage void aes_xts_encrypt_zvkned_zvbb_zvkg(
+                       const struct crypto_aes_ctx *key,
+                       const u8 *in, u8 *out, size_t len,
+                       u8 tweak[AES_BLOCK_SIZE]);
+
+asmlinkage void aes_xts_decrypt_zvkned_zvbb_zvkg(
+                       const struct crypto_aes_ctx *key,
+                       const u8 *in, u8 *out, size_t len,
+                       u8 tweak[AES_BLOCK_SIZE]);
+
+static int riscv64_aes_setkey(struct crypto_aes_ctx *ctx,
+                             const u8 *key, unsigned int keylen)
+{
+       /*
+        * For now we just use the generic key expansion, for these reasons:
+        *
+        * - zvkned's key expansion instructions don't support AES-192.
+        *   So, non-zvkned fallback code would be needed anyway.
+        *
+        * - Users of AES in Linux usually don't change keys frequently.
+        *   So, key expansion isn't performance-critical.
+        *
+        * - For single-block AES exposed as a "cipher" algorithm, it's
+        *   necessary to use struct crypto_aes_ctx and initialize its 'key_dec'
+        *   field with the round keys for the Equivalent Inverse Cipher.  This
+        *   is because with "cipher", decryption can be requested from a
+        *   context where the vector unit isn't usable, necessitating a
+        *   fallback to aes_decrypt().  But, zvkned can only generate and use
+        *   the normal round keys.  Of course, it's preferable to not have
+        *   special code just for "cipher", as e.g. XTS also uses a
+        *   single-block AES encryption.  It's simplest to just use
+        *   struct crypto_aes_ctx and aes_expandkey() everywhere.
+        */
+       return aes_expandkey(ctx, key, keylen);
+}
+
+static int riscv64_aes_setkey_cipher(struct crypto_tfm *tfm,
+                                    const u8 *key, unsigned int keylen)
+{
+       struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+       return riscv64_aes_setkey(ctx, key, keylen);
+}
+
+static int riscv64_aes_setkey_skcipher(struct crypto_skcipher *tfm,
+                                      const u8 *key, unsigned int keylen)
+{
+       struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+       return riscv64_aes_setkey(ctx, key, keylen);
+}
+
+/* Bare AES, without a mode of operation */
+
+static void riscv64_aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+       const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+       if (crypto_simd_usable()) {
+               kernel_vector_begin();
+               aes_encrypt_zvkned(ctx, src, dst);
+               kernel_vector_end();
+       } else {
+               aes_encrypt(ctx, dst, src);
+       }
+}
+
+static void riscv64_aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+       const struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
+
+       if (crypto_simd_usable()) {
+               kernel_vector_begin();
+               aes_decrypt_zvkned(ctx, src, dst);
+               kernel_vector_end();
+       } else {
+               aes_decrypt(ctx, dst, src);
+       }
+}
+
+/* AES-ECB */
+
+static inline int riscv64_aes_ecb_crypt(struct skcipher_request *req, bool enc)
+{
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct skcipher_walk walk;
+       unsigned int nbytes;
+       int err;
+
+       err = skcipher_walk_virt(&walk, req, false);
+       while ((nbytes = walk.nbytes) != 0) {
+               kernel_vector_begin();
+               if (enc)
+                       aes_ecb_encrypt_zvkned(ctx, walk.src.virt.addr,
+                                              walk.dst.virt.addr,
+                                              nbytes & ~(AES_BLOCK_SIZE - 1));
+               else
+                       aes_ecb_decrypt_zvkned(ctx, walk.src.virt.addr,
+                                              walk.dst.virt.addr,
+                                              nbytes & ~(AES_BLOCK_SIZE - 1));
+               kernel_vector_end();
+               err = skcipher_walk_done(&walk, nbytes & (AES_BLOCK_SIZE - 1));
+       }
+
+       return err;
+}
+
+static int riscv64_aes_ecb_encrypt(struct skcipher_request *req)
+{
+       return riscv64_aes_ecb_crypt(req, true);
+}
+
+static int riscv64_aes_ecb_decrypt(struct skcipher_request *req)
+{
+       return riscv64_aes_ecb_crypt(req, false);
+}
+
+/* AES-CBC */
+
+static int riscv64_aes_cbc_crypt(struct skcipher_request *req, bool enc)
+{
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct skcipher_walk walk;
+       unsigned int nbytes;
+       int err;
+
+       err = skcipher_walk_virt(&walk, req, false);
+       while ((nbytes = walk.nbytes) != 0) {
+               kernel_vector_begin();
+               if (enc)
+                       aes_cbc_encrypt_zvkned(ctx, walk.src.virt.addr,
+                                              walk.dst.virt.addr,
+                                              nbytes & ~(AES_BLOCK_SIZE - 1),
+                                              walk.iv);
+               else
+                       aes_cbc_decrypt_zvkned(ctx, walk.src.virt.addr,
+                                              walk.dst.virt.addr,
+                                              nbytes & ~(AES_BLOCK_SIZE - 1),
+                                              walk.iv);
+               kernel_vector_end();
+               err = skcipher_walk_done(&walk, nbytes & (AES_BLOCK_SIZE - 1));
+       }
+
+       return err;
+}
+
+static int riscv64_aes_cbc_encrypt(struct skcipher_request *req)
+{
+       return riscv64_aes_cbc_crypt(req, true);
+}
+
+static int riscv64_aes_cbc_decrypt(struct skcipher_request *req)
+{
+       return riscv64_aes_cbc_crypt(req, false);
+}
+
+/* AES-CBC-CTS */
+
+static int riscv64_aes_cbc_cts_crypt(struct skcipher_request *req, bool enc)
+{
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct scatterlist sg_src[2], sg_dst[2];
+       struct skcipher_request subreq;
+       struct scatterlist *src, *dst;
+       struct skcipher_walk walk;
+       unsigned int cbc_len;
+       int err;
+
+       if (req->cryptlen < AES_BLOCK_SIZE)
+               return -EINVAL;
+
+       err = skcipher_walk_virt(&walk, req, false);
+       if (err)
+               return err;
+       /*
+        * If the full message is available in one step, decrypt it in one call
+        * to the CBC-CTS assembly function.  This reduces overhead, especially
+        * on short messages.  Otherwise, fall back to doing CBC up to the last
+        * two blocks, then invoke CTS just for the ciphertext stealing.
+        */
+       if (unlikely(walk.nbytes != req->cryptlen)) {
+               cbc_len = round_down(req->cryptlen - AES_BLOCK_SIZE - 1,
+                                    AES_BLOCK_SIZE);
+               skcipher_walk_abort(&walk);
+               skcipher_request_set_tfm(&subreq, tfm);
+               skcipher_request_set_callback(&subreq,
+                                             skcipher_request_flags(req),
+                                             NULL, NULL);
+               skcipher_request_set_crypt(&subreq, req->src, req->dst,
+                                          cbc_len, req->iv);
+               err = riscv64_aes_cbc_crypt(&subreq, enc);
+               if (err)
+                       return err;
+               dst = src = scatterwalk_ffwd(sg_src, req->src, cbc_len);
+               if (req->dst != req->src)
+                       dst = scatterwalk_ffwd(sg_dst, req->dst, cbc_len);
+               skcipher_request_set_crypt(&subreq, src, dst,
+                                          req->cryptlen - cbc_len, req->iv);
+               err = skcipher_walk_virt(&walk, &subreq, false);
+               if (err)
+                       return err;
+       }
+       kernel_vector_begin();
+       aes_cbc_cts_crypt_zvkned(ctx, walk.src.virt.addr, walk.dst.virt.addr,
+                                walk.nbytes, req->iv, enc);
+       kernel_vector_end();
+       return skcipher_walk_done(&walk, 0);
+}
+
+static int riscv64_aes_cbc_cts_encrypt(struct skcipher_request *req)
+{
+       return riscv64_aes_cbc_cts_crypt(req, true);
+}
+
+static int riscv64_aes_cbc_cts_decrypt(struct skcipher_request *req)
+{
+       return riscv64_aes_cbc_cts_crypt(req, false);
+}
+
+/* AES-CTR */
+
+static int riscv64_aes_ctr_crypt(struct skcipher_request *req)
+{
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       const struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm);
+       unsigned int nbytes, p1_nbytes;
+       struct skcipher_walk walk;
+       u32 ctr32, nblocks;
+       int err;
+
+       /* Get the low 32-bit word of the 128-bit big endian counter. */
+       ctr32 = get_unaligned_be32(req->iv + 12);
+
+       err = skcipher_walk_virt(&walk, req, false);
+       while ((nbytes = walk.nbytes) != 0) {
+               if (nbytes < walk.total) {
+                       /* Not the end yet, so keep the length block-aligned. */
+                       nbytes = round_down(nbytes, AES_BLOCK_SIZE);
+                       nblocks = nbytes / AES_BLOCK_SIZE;
+               } else {
+                       /* It's the end, so include any final partial block. */
+                       nblocks = DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE);
+               }
+               ctr32 += nblocks;
+
+               kernel_vector_begin();
+               if (ctr32 >= nblocks) {
+                       /* The low 32-bit word of the counter won't overflow. */
+                       aes_ctr32_crypt_zvkned_zvkb(ctx, walk.src.virt.addr,
+                                                   walk.dst.virt.addr, nbytes,
+                                                   req->iv);
+               } else {
+                       /*
+                        * The low 32-bit word of the counter will overflow.
+                        * The assembly doesn't handle this case, so split the
+                        * operation into two at the point where the overflow
+                        * will occur.  After the first part, add the carry bit.
+                        */
+                       p1_nbytes = min_t(unsigned int, nbytes,
+                                         (nblocks - ctr32) * AES_BLOCK_SIZE);
+                       aes_ctr32_crypt_zvkned_zvkb(ctx, walk.src.virt.addr,
+                                                   walk.dst.virt.addr,
+                                                   p1_nbytes, req->iv);
+                       crypto_inc(req->iv, 12);
+
+                       if (ctr32) {
+                               aes_ctr32_crypt_zvkned_zvkb(
+                                       ctx,
+                                       walk.src.virt.addr + p1_nbytes,
+                                       walk.dst.virt.addr + p1_nbytes,
+                                       nbytes - p1_nbytes, req->iv);
+                       }
+               }
+               kernel_vector_end();
+
+               err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+       }
+
+       return err;
+}
+
+/* AES-XTS */
+
+struct riscv64_aes_xts_ctx {
+       struct crypto_aes_ctx ctx1;
+       struct crypto_aes_ctx ctx2;
+};
+
+static int riscv64_aes_xts_setkey(struct crypto_skcipher *tfm, const u8 *key,
+                                 unsigned int keylen)
+{
+       struct riscv64_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+
+       return xts_verify_key(tfm, key, keylen) ?:
+              riscv64_aes_setkey(&ctx->ctx1, key, keylen / 2) ?:
+              riscv64_aes_setkey(&ctx->ctx2, key + keylen / 2, keylen / 2);
+}
+
+static int riscv64_aes_xts_crypt(struct skcipher_request *req, bool enc)
+{
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       const struct riscv64_aes_xts_ctx *ctx = crypto_skcipher_ctx(tfm);
+       int tail = req->cryptlen % AES_BLOCK_SIZE;
+       struct scatterlist sg_src[2], sg_dst[2];
+       struct skcipher_request subreq;
+       struct scatterlist *src, *dst;
+       struct skcipher_walk walk;
+       int err;
+
+       if (req->cryptlen < AES_BLOCK_SIZE)
+               return -EINVAL;
+
+       /* Encrypt the IV with the tweak key to get the first tweak. */
+       kernel_vector_begin();
+       aes_encrypt_zvkned(&ctx->ctx2, req->iv, req->iv);
+       kernel_vector_end();
+
+       err = skcipher_walk_virt(&walk, req, false);
+
+       /*
+        * If the message length isn't divisible by the AES block size and the
+        * full message isn't available in one step of the scatterlist walk,
+        * then separate off the last full block and the partial block.  This
+        * ensures that they are processed in the same call to the assembly
+        * function, which is required for ciphertext stealing.
+        */
+       if (unlikely(tail > 0 && walk.nbytes < walk.total)) {
+               skcipher_walk_abort(&walk);
+
+               skcipher_request_set_tfm(&subreq, tfm);
+               skcipher_request_set_callback(&subreq,
+                                             skcipher_request_flags(req),
+                                             NULL, NULL);
+               skcipher_request_set_crypt(&subreq, req->src, req->dst,
+                                          req->cryptlen - tail - AES_BLOCK_SIZE,
+                                          req->iv);
+               req = &subreq;
+               err = skcipher_walk_virt(&walk, req, false);
+       } else {
+               tail = 0;
+       }
+
+       while (walk.nbytes) {
+               unsigned int nbytes = walk.nbytes;
+
+               if (nbytes < walk.total)
+                       nbytes = round_down(nbytes, AES_BLOCK_SIZE);
+
+               kernel_vector_begin();
+               if (enc)
+                       aes_xts_encrypt_zvkned_zvbb_zvkg(
+                               &ctx->ctx1, walk.src.virt.addr,
+                               walk.dst.virt.addr, nbytes, req->iv);
+               else
+                       aes_xts_decrypt_zvkned_zvbb_zvkg(
+                               &ctx->ctx1, walk.src.virt.addr,
+                               walk.dst.virt.addr, nbytes, req->iv);
+               kernel_vector_end();
+               err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
+       }
+
+       if (err || likely(!tail))
+               return err;
+
+       /* Do ciphertext stealing with the last full block and partial block. */
+
+       dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen);
+       if (req->dst != req->src)
+               dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen);
+
+       skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail,
+                                  req->iv);
+
+       err = skcipher_walk_virt(&walk, req, false);
+       if (err)
+               return err;
+
+       kernel_vector_begin();
+       if (enc)
+               aes_xts_encrypt_zvkned_zvbb_zvkg(
+                       &ctx->ctx1, walk.src.virt.addr,
+                       walk.dst.virt.addr, walk.nbytes, req->iv);
+       else
+               aes_xts_decrypt_zvkned_zvbb_zvkg(
+                       &ctx->ctx1, walk.src.virt.addr,
+                       walk.dst.virt.addr, walk.nbytes, req->iv);
+       kernel_vector_end();
+
+       return skcipher_walk_done(&walk, 0);
+}
+
+static int riscv64_aes_xts_encrypt(struct skcipher_request *req)
+{
+       return riscv64_aes_xts_crypt(req, true);
+}
+
+static int riscv64_aes_xts_decrypt(struct skcipher_request *req)
+{
+       return riscv64_aes_xts_crypt(req, false);
+}
+
+/* Algorithm definitions */
+
+static struct crypto_alg riscv64_zvkned_aes_cipher_alg = {
+       .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+       .cra_blocksize = AES_BLOCK_SIZE,
+       .cra_ctxsize = sizeof(struct crypto_aes_ctx),
+       .cra_priority = 300,
+       .cra_name = "aes",
+       .cra_driver_name = "aes-riscv64-zvkned",
+       .cra_cipher = {
+               .cia_min_keysize = AES_MIN_KEY_SIZE,
+               .cia_max_keysize = AES_MAX_KEY_SIZE,
+               .cia_setkey = riscv64_aes_setkey_cipher,
+               .cia_encrypt = riscv64_aes_encrypt,
+               .cia_decrypt = riscv64_aes_decrypt,
+       },
+       .cra_module = THIS_MODULE,
+};
+
+static struct skcipher_alg riscv64_zvkned_aes_skcipher_algs[] = {
+       {
+               .setkey = riscv64_aes_setkey_skcipher,
+               .encrypt = riscv64_aes_ecb_encrypt,
+               .decrypt = riscv64_aes_ecb_decrypt,
+               .min_keysize = AES_MIN_KEY_SIZE,
+               .max_keysize = AES_MAX_KEY_SIZE,
+               .walksize = 8 * AES_BLOCK_SIZE, /* matches LMUL=8 */
+               .base = {
+                       .cra_blocksize = AES_BLOCK_SIZE,
+                       .cra_ctxsize = sizeof(struct crypto_aes_ctx),
+                       .cra_priority = 300,
+                       .cra_name = "ecb(aes)",
+                       .cra_driver_name = "ecb-aes-riscv64-zvkned",
+                       .cra_module = THIS_MODULE,
+               },
+       }, {
+               .setkey = riscv64_aes_setkey_skcipher,
+               .encrypt = riscv64_aes_cbc_encrypt,
+               .decrypt = riscv64_aes_cbc_decrypt,
+               .min_keysize = AES_MIN_KEY_SIZE,
+               .max_keysize = AES_MAX_KEY_SIZE,
+               .ivsize = AES_BLOCK_SIZE,
+               .base = {
+                       .cra_blocksize = AES_BLOCK_SIZE,
+                       .cra_ctxsize = sizeof(struct crypto_aes_ctx),
+                       .cra_priority = 300,
+                       .cra_name = "cbc(aes)",
+                       .cra_driver_name = "cbc-aes-riscv64-zvkned",
+                       .cra_module = THIS_MODULE,
+               },
+       }, {
+               .setkey = riscv64_aes_setkey_skcipher,
+               .encrypt = riscv64_aes_cbc_cts_encrypt,
+               .decrypt = riscv64_aes_cbc_cts_decrypt,
+               .min_keysize = AES_MIN_KEY_SIZE,
+               .max_keysize = AES_MAX_KEY_SIZE,
+               .ivsize = AES_BLOCK_SIZE,
+               .walksize = 4 * AES_BLOCK_SIZE, /* matches LMUL=4 */
+               .base = {
+                       .cra_blocksize = AES_BLOCK_SIZE,
+                       .cra_ctxsize = sizeof(struct crypto_aes_ctx),
+                       .cra_priority = 300,
+                       .cra_name = "cts(cbc(aes))",
+                       .cra_driver_name = "cts-cbc-aes-riscv64-zvkned",
+                       .cra_module = THIS_MODULE,
+               },
+       }
+};
+
+static struct skcipher_alg riscv64_zvkned_zvkb_aes_skcipher_alg = {
+       .setkey = riscv64_aes_setkey_skcipher,
+       .encrypt = riscv64_aes_ctr_crypt,
+       .decrypt = riscv64_aes_ctr_crypt,
+       .min_keysize = AES_MIN_KEY_SIZE,
+       .max_keysize = AES_MAX_KEY_SIZE,
+       .ivsize = AES_BLOCK_SIZE,
+       .chunksize = AES_BLOCK_SIZE,
+       .walksize = 4 * AES_BLOCK_SIZE, /* matches LMUL=4 */
+       .base = {
+               .cra_blocksize = 1,
+               .cra_ctxsize = sizeof(struct crypto_aes_ctx),
+               .cra_priority = 300,
+               .cra_name = "ctr(aes)",
+               .cra_driver_name = "ctr-aes-riscv64-zvkned-zvkb",
+               .cra_module = THIS_MODULE,
+       },
+};
+
+static struct skcipher_alg riscv64_zvkned_zvbb_zvkg_aes_skcipher_alg = {
+       .setkey = riscv64_aes_xts_setkey,
+       .encrypt = riscv64_aes_xts_encrypt,
+       .decrypt = riscv64_aes_xts_decrypt,
+       .min_keysize = 2 * AES_MIN_KEY_SIZE,
+       .max_keysize = 2 * AES_MAX_KEY_SIZE,
+       .ivsize = AES_BLOCK_SIZE,
+       .chunksize = AES_BLOCK_SIZE,
+       .walksize = 4 * AES_BLOCK_SIZE, /* matches LMUL=4 */
+       .base = {
+               .cra_blocksize = AES_BLOCK_SIZE,
+               .cra_ctxsize = sizeof(struct riscv64_aes_xts_ctx),
+               .cra_priority = 300,
+               .cra_name = "xts(aes)",
+               .cra_driver_name = "xts-aes-riscv64-zvkned-zvbb-zvkg",
+               .cra_module = THIS_MODULE,
+       },
+};
+
+static inline bool riscv64_aes_xts_supported(void)
+{
+       return riscv_isa_extension_available(NULL, ZVBB) &&
+              riscv_isa_extension_available(NULL, ZVKG) &&
+              riscv_vector_vlen() < 2048 /* Implementation limitation */;
+}
+
+static int __init riscv64_aes_mod_init(void)
+{
+       int err = -ENODEV;
+
+       if (riscv_isa_extension_available(NULL, ZVKNED) &&
+           riscv_vector_vlen() >= 128) {
+               err = crypto_register_alg(&riscv64_zvkned_aes_cipher_alg);
+               if (err)
+                       return err;
+
+               err = crypto_register_skciphers(
+                       riscv64_zvkned_aes_skcipher_algs,
+                       ARRAY_SIZE(riscv64_zvkned_aes_skcipher_algs));
+               if (err)
+                       goto unregister_zvkned_cipher_alg;
+
+               if (riscv_isa_extension_available(NULL, ZVKB)) {
+                       err = crypto_register_skcipher(
+                               &riscv64_zvkned_zvkb_aes_skcipher_alg);
+                       if (err)
+                               goto unregister_zvkned_skcipher_algs;
+               }
+
+               if (riscv64_aes_xts_supported()) {
+                       err = crypto_register_skcipher(
+                               &riscv64_zvkned_zvbb_zvkg_aes_skcipher_alg);
+                       if (err)
+                               goto unregister_zvkned_zvkb_skcipher_alg;
+               }
+       }
+
+       return err;
+
+unregister_zvkned_zvkb_skcipher_alg:
+       if (riscv_isa_extension_available(NULL, ZVKB))
+               crypto_unregister_skcipher(&riscv64_zvkned_zvkb_aes_skcipher_alg);
+unregister_zvkned_skcipher_algs:
+       crypto_unregister_skciphers(riscv64_zvkned_aes_skcipher_algs,
+                                   ARRAY_SIZE(riscv64_zvkned_aes_skcipher_algs));
+unregister_zvkned_cipher_alg:
+       crypto_unregister_alg(&riscv64_zvkned_aes_cipher_alg);
+       return err;
+}
+
+static void __exit riscv64_aes_mod_exit(void)
+{
+       if (riscv64_aes_xts_supported())
+               crypto_unregister_skcipher(&riscv64_zvkned_zvbb_zvkg_aes_skcipher_alg);
+       if (riscv_isa_extension_available(NULL, ZVKB))
+               crypto_unregister_skcipher(&riscv64_zvkned_zvkb_aes_skcipher_alg);
+       crypto_unregister_skciphers(riscv64_zvkned_aes_skcipher_algs,
+                                   ARRAY_SIZE(riscv64_zvkned_aes_skcipher_algs));
+       crypto_unregister_alg(&riscv64_zvkned_aes_cipher_alg);
+}
+
+module_init(riscv64_aes_mod_init);
+module_exit(riscv64_aes_mod_exit);
+
+MODULE_DESCRIPTION("AES-ECB/CBC/CTS/CTR/XTS (RISC-V accelerated)");
+MODULE_AUTHOR("Jerry Shih <jerry.shih@sifive.com>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("aes");
+MODULE_ALIAS_CRYPTO("ecb(aes)");
+MODULE_ALIAS_CRYPTO("cbc(aes)");
+MODULE_ALIAS_CRYPTO("cts(cbc(aes))");
+MODULE_ALIAS_CRYPTO("ctr(aes)");
+MODULE_ALIAS_CRYPTO("xts(aes)");
diff --git a/arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S b/arch/riscv/crypto/aes-riscv64-zvkned-zvbb-zvkg.S
new file mode 100644 (file)
index 0000000..146fc9c
--- /dev/null
@@ -0,0 +1,312 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// This file is dual-licensed, meaning that you can use it under your
+// choice of either of the following two licenses:
+//
+// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You can obtain
+// a copy in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// or
+//
+// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
+// Copyright 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The generated code of this file depends on the following RISC-V extensions:
+// - RV64I
+// - RISC-V Vector ('V') with VLEN >= 128 && VLEN < 2048
+// - RISC-V Vector AES block cipher extension ('Zvkned')
+// - RISC-V Vector Bit-manipulation extension ('Zvbb')
+// - RISC-V Vector GCM/GMAC extension ('Zvkg')
+
+#include <linux/linkage.h>
+
+.text
+.option arch, +zvkned, +zvbb, +zvkg
+
+#include "aes-macros.S"
+
+#define KEYP           a0
+#define INP            a1
+#define OUTP           a2
+#define LEN            a3
+#define TWEAKP         a4
+
+#define LEN32          a5
+#define TAIL_LEN       a6
+#define VL             a7
+#define VLMAX          t4
+
+// v1-v15 contain the AES round keys, but they are used for temporaries before
+// the AES round keys have been loaded.
+#define TWEAKS         v16     // LMUL=4 (most of the time)
+#define TWEAKS_BREV    v20     // LMUL=4 (most of the time)
+#define MULTS_BREV     v24     // LMUL=4 (most of the time)
+#define TMP0           v28
+#define TMP1           v29
+#define TMP2           v30
+#define TMP3           v31
+
+// xts_init initializes the following values:
+//
+//     TWEAKS: N 128-bit tweaks T*(x^i) for i in 0..(N - 1)
+//     TWEAKS_BREV: same as TWEAKS, but bit-reversed
+//     MULTS_BREV: N 128-bit values x^N, bit-reversed.  Only if N > 1.
+//
+// N is the maximum number of blocks that will be processed per loop iteration,
+// computed using vsetvli.
+//
+// The field convention used by XTS is the same as that of GHASH, but with the
+// bits reversed within each byte.  The zvkg extension provides the vgmul
+// instruction which does multiplication in this field.  Therefore, for tweak
+// computation we use vgmul to do multiplications in parallel, instead of
+// serially multiplying by x using shifting+xoring.  Note that for this to work,
+// the inputs and outputs to vgmul must be bit-reversed (we do it with vbrev8).
+.macro xts_init
+
+       // Load the first tweak T.
+       vsetivli        zero, 4, e32, m1, ta, ma
+       vle32.v         TWEAKS, (TWEAKP)
+
+       // If there's only one block (or no blocks at all), then skip the tweak
+       // sequence computation because (at most) T itself is needed.
+       li              t0, 16
+       ble             LEN, t0, .Linit_single_block\@
+
+       // Save a copy of T bit-reversed in v12.
+       vbrev8.v        v12, TWEAKS
+
+       //
+       // Generate x^i for i in 0..(N - 1), i.e. 128-bit values 1 << i assuming
+       // that N <= 128.  Though, this code actually requires N < 64 (or
+       // equivalently VLEN < 2048) due to the use of 64-bit intermediate
+       // values here and in the x^N computation later.
+       //
+       vsetvli         VL, LEN32, e32, m4, ta, ma
+       srli            t0, VL, 2       // t0 = N (num blocks)
+       // Generate two sequences, each with N 32-bit values:
+       // v0=[1, 1, 1, ...] and v1=[0, 1, 2, ...].
+       vsetvli         zero, t0, e32, m1, ta, ma
+       vmv.v.i         v0, 1
+       vid.v           v1
+       // Use vzext to zero-extend the sequences to 64 bits.  Reinterpret them
+       // as two sequences, each with 2*N 32-bit values:
+       // v2=[1, 0, 1, 0, 1, 0, ...] and v4=[0, 0, 1, 0, 2, 0, ...].
+       vsetvli         zero, t0, e64, m2, ta, ma
+       vzext.vf2       v2, v0
+       vzext.vf2       v4, v1
+       slli            t1, t0, 1       // t1 = 2*N
+       vsetvli         zero, t1, e32, m2, ta, ma
+       // Use vwsll to compute [1<<0, 0<<0, 1<<1, 0<<0, 1<<2, 0<<0, ...],
+       // widening to 64 bits per element.  When reinterpreted as N 128-bit
+       // values, this is the needed sequence of 128-bit values 1 << i (x^i).
+       vwsll.vv        v8, v2, v4
+
+       // Copy the bit-reversed T to all N elements of TWEAKS_BREV, then
+       // multiply by x^i.  This gives the sequence T*(x^i), bit-reversed.
+       vsetvli         zero, LEN32, e32, m4, ta, ma
+       vmv.v.i         TWEAKS_BREV, 0
+       vaesz.vs        TWEAKS_BREV, v12
+       vbrev8.v        v8, v8
+       vgmul.vv        TWEAKS_BREV, v8
+
+       // Save a copy of the sequence T*(x^i) with the bit reversal undone.
+       vbrev8.v        TWEAKS, TWEAKS_BREV
+
+       // Generate N copies of x^N, i.e. 128-bit values 1 << N, bit-reversed.
+       li              t1, 1
+       sll             t1, t1, t0      // t1 = 1 << N
+       vsetivli        zero, 2, e64, m1, ta, ma
+       vmv.v.i         v0, 0
+       vsetivli        zero, 1, e64, m1, tu, ma
+       vmv.v.x         v0, t1
+       vbrev8.v        v0, v0
+       vsetvli         zero, LEN32, e32, m4, ta, ma
+       vmv.v.i         MULTS_BREV, 0
+       vaesz.vs        MULTS_BREV, v0
+
+       j               .Linit_done\@
+
+.Linit_single_block\@:
+       vbrev8.v        TWEAKS_BREV, TWEAKS
+.Linit_done\@:
+.endm
+
+// Set the first 128 bits of MULTS_BREV to 0x40, i.e. 'x' bit-reversed.  This is
+// the multiplier required to advance the tweak by one.
+.macro load_x
+       li              t0, 0x40
+       vsetivli        zero, 4, e32, m1, ta, ma
+       vmv.v.i         MULTS_BREV, 0
+       vsetivli        zero, 1, e8, m1, tu, ma
+       vmv.v.x         MULTS_BREV, t0
+.endm
+
+.macro __aes_xts_crypt enc, keylen
+       // With 16 < len <= 31, there's no main loop, just ciphertext stealing.
+       beqz            LEN32, .Lcts_without_main_loop\@
+
+       vsetvli         VLMAX, zero, e32, m4, ta, ma
+1:
+       vsetvli         VL, LEN32, e32, m4, ta, ma
+2:
+       // Encrypt or decrypt VL/4 blocks.
+       vle32.v         TMP0, (INP)
+       vxor.vv         TMP0, TMP0, TWEAKS
+       aes_crypt       TMP0, \enc, \keylen
+       vxor.vv         TMP0, TMP0, TWEAKS
+       vse32.v         TMP0, (OUTP)
+
+       // Update the pointers and the remaining length.
+       slli            t0, VL, 2
+       add             INP, INP, t0
+       add             OUTP, OUTP, t0
+       sub             LEN32, LEN32, VL
+
+       // Check whether more blocks remain.
+       beqz            LEN32, .Lmain_loop_done\@
+
+       // Compute the next sequence of tweaks by multiplying the previous
+       // sequence by x^N.  Store the result in both bit-reversed order and
+       // regular order (i.e. with the bit reversal undone).
+       vgmul.vv        TWEAKS_BREV, MULTS_BREV
+       vbrev8.v        TWEAKS, TWEAKS_BREV
+
+       // Since we compute the tweak multipliers x^N in advance, we require
+       // that each iteration process the same length except possibly the last.
+       // This conflicts slightly with the behavior allowed by RISC-V Vector
+       // Extension, where CPUs can select a lower length for both of the last
+       // two iterations.  E.g., vl might take the sequence of values
+       // [16, 16, 16, 12, 12], whereas we need [16, 16, 16, 16, 8] so that we
+       // can use x^4 again instead of computing x^3.  Therefore, we explicitly
+       // keep the vl at VLMAX if there is at least VLMAX remaining.
+       bge             LEN32, VLMAX, 2b
+       j               1b
+
+.Lmain_loop_done\@:
+       load_x
+
+       // Compute the next tweak.
+       addi            t0, VL, -4
+       vsetivli        zero, 4, e32, m4, ta, ma
+       vslidedown.vx   TWEAKS_BREV, TWEAKS_BREV, t0    // Extract last tweak
+       vsetivli        zero, 4, e32, m1, ta, ma
+       vgmul.vv        TWEAKS_BREV, MULTS_BREV         // Advance to next tweak
+
+       bnez            TAIL_LEN, .Lcts\@
+
+       // Update *TWEAKP to contain the next tweak.
+       vbrev8.v        TWEAKS, TWEAKS_BREV
+       vse32.v         TWEAKS, (TWEAKP)
+       ret
+
+.Lcts_without_main_loop\@:
+       load_x
+.Lcts\@:
+       // TWEAKS_BREV now contains the next tweak.  Compute the one after that.
+       vsetivli        zero, 4, e32, m1, ta, ma
+       vmv.v.v         TMP0, TWEAKS_BREV
+       vgmul.vv        TMP0, MULTS_BREV
+       // Undo the bit reversal of the next two tweaks and store them in TMP1
+       // and TMP2, such that TMP1 is the first needed and TMP2 the second.
+.if \enc
+       vbrev8.v        TMP1, TWEAKS_BREV
+       vbrev8.v        TMP2, TMP0
+.else
+       vbrev8.v        TMP1, TMP0
+       vbrev8.v        TMP2, TWEAKS_BREV
+.endif
+
+       // Encrypt/decrypt the last full block.
+       vle32.v         TMP0, (INP)
+       vxor.vv         TMP0, TMP0, TMP1
+       aes_crypt       TMP0, \enc, \keylen
+       vxor.vv         TMP0, TMP0, TMP1
+
+       // Swap the first TAIL_LEN bytes of the above result with the tail.
+       // Note that to support in-place encryption/decryption, the load from
+       // the input tail must happen before the store to the output tail.
+       addi            t0, INP, 16
+       addi            t1, OUTP, 16
+       vmv.v.v         TMP3, TMP0
+       vsetvli         zero, TAIL_LEN, e8, m1, tu, ma
+       vle8.v          TMP0, (t0)
+       vse8.v          TMP3, (t1)
+
+       // Encrypt/decrypt again and store the last full block.
+       vsetivli        zero, 4, e32, m1, ta, ma
+       vxor.vv         TMP0, TMP0, TMP2
+       aes_crypt       TMP0, \enc, \keylen
+       vxor.vv         TMP0, TMP0, TMP2
+       vse32.v         TMP0, (OUTP)
+
+       ret
+.endm
+
+.macro aes_xts_crypt   enc
+
+       // Check whether the length is a multiple of the AES block size.
+       andi            TAIL_LEN, LEN, 15
+       beqz            TAIL_LEN, 1f
+
+       // The length isn't a multiple of the AES block size, so ciphertext
+       // stealing will be required.  Ciphertext stealing involves special
+       // handling of the partial block and the last full block, so subtract
+       // the length of both from the length to be processed in the main loop.
+       sub             LEN, LEN, TAIL_LEN
+       addi            LEN, LEN, -16
+1:
+       srli            LEN32, LEN, 2
+       // LEN and LEN32 now contain the total length of the blocks that will be
+       // processed in the main loop, in bytes and 32-bit words respectively.
+
+       xts_init
+       aes_begin       KEYP, 128f, 192f
+       __aes_xts_crypt \enc, 256
+128:
+       __aes_xts_crypt \enc, 128
+192:
+       __aes_xts_crypt \enc, 192
+.endm
+
+// void aes_xts_encrypt_zvkned_zvbb_zvkg(const struct crypto_aes_ctx *key,
+//                                      const u8 *in, u8 *out, size_t len,
+//                                      u8 tweak[16]);
+//
+// |key| is the data key.  |tweak| contains the next tweak; the encryption of
+// the original IV with the tweak key was already done.  This function supports
+// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and
+// |len| must be a multiple of 16 except on the last call.  If |len| is a
+// multiple of 16, then this function updates |tweak| to contain the next tweak.
+SYM_FUNC_START(aes_xts_encrypt_zvkned_zvbb_zvkg)
+       aes_xts_crypt   1
+SYM_FUNC_END(aes_xts_encrypt_zvkned_zvbb_zvkg)
+
+// Same prototype and calling convention as the encryption function
+SYM_FUNC_START(aes_xts_decrypt_zvkned_zvbb_zvkg)
+       aes_xts_crypt   0
+SYM_FUNC_END(aes_xts_decrypt_zvkned_zvbb_zvkg)
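For comparison with the parallel tweak schedule that xts_init computes with vgmul, the conventional scalar XTS tweak update advances one tweak at a time by multiplying it by x in GF(2^128) using the shift-and-conditional-XOR form that the comments above deliberately avoid. A minimal scalar sketch (the helper name is illustrative):

#include <linux/types.h>

/* Scalar reference: advance an XTS tweak by one block, i.e. multiply it by x
 * in GF(2^128) with the x^128 + x^7 + x^2 + x + 1 polynomial.  The tweak is
 * treated as a little-endian 128-bit value, as in IEEE P1619. */
static void xts_mul_x_ref(u8 t[16])
{
	unsigned int carry = 0;
	int i;

	for (i = 0; i < 16; i++) {
		unsigned int b = t[i];

		t[i] = (u8)((b << 1) | carry);
		carry = b >> 7;
	}
	if (carry)
		t[0] ^= 0x87;	/* reduce by the XTS polynomial */
}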
diff --git a/arch/riscv/crypto/aes-riscv64-zvkned-zvkb.S b/arch/riscv/crypto/aes-riscv64-zvkned-zvkb.S
new file mode 100644 (file)
index 0000000..9962d45
--- /dev/null
@@ -0,0 +1,146 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// This file is dual-licensed, meaning that you can use it under your
+// choice of either of the following two licenses:
+//
+// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You can obtain
+// a copy in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// or
+//
+// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
+// Copyright 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The generated code of this file depends on the following RISC-V extensions:
+// - RV64I
+// - RISC-V Vector ('V') with VLEN >= 128
+// - RISC-V Vector AES block cipher extension ('Zvkned')
+// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
+
+#include <linux/linkage.h>
+
+.text
+.option arch, +zvkned, +zvkb
+
+#include "aes-macros.S"
+
+#define KEYP           a0
+#define INP            a1
+#define OUTP           a2
+#define LEN            a3
+#define IVP            a4
+
+#define LEN32          a5
+#define VL_E32         a6
+#define VL_BLOCKS      a7
+
+.macro aes_ctr32_crypt keylen
+       // LEN32 = number of blocks, rounded up, in 32-bit words.
+       addi            t0, LEN, 15
+       srli            t0, t0, 4
+       slli            LEN32, t0, 2
+
+       // Create a mask that selects the last 32-bit word of each 128-bit
+       // block.  This is the word that contains the (big-endian) counter.
+       li              t0, 0x88
+       vsetvli         t1, zero, e8, m1, ta, ma
+       vmv.v.x         v0, t0
+
+       // Load the IV into v31.  The last 32-bit word contains the counter.
+       vsetivli        zero, 4, e32, m1, ta, ma
+       vle32.v         v31, (IVP)
+
+       // Convert the big-endian counter into little-endian.
+       vsetivli        zero, 4, e32, m1, ta, mu
+       vrev8.v         v31, v31, v0.t
+
+       // Splat the IV to v16 (with LMUL=4).  The number of copies is the
+       // maximum number of blocks that will be processed per iteration.
+       vsetvli         zero, LEN32, e32, m4, ta, ma
+       vmv.v.i         v16, 0
+       vaesz.vs        v16, v31
+
+       // v20 = [x, x, x, 0, x, x, x, 1, ...]
+       viota.m         v20, v0, v0.t
+       // v16 = [IV0, IV1, IV2, counter+0, IV0, IV1, IV2, counter+1, ...]
+       vsetvli         VL_E32, LEN32, e32, m4, ta, mu
+       vadd.vv         v16, v16, v20, v0.t
+
+       j 2f
+1:
+       // Set the number of blocks to process in this iteration.  vl=VL_E32 is
+       // the length in 32-bit words, i.e. 4 times the number of blocks.
+       vsetvli         VL_E32, LEN32, e32, m4, ta, mu
+
+       // Increment the counters by the number of blocks processed in the
+       // previous iteration.
+       vadd.vx         v16, v16, VL_BLOCKS, v0.t
+2:
+       // Prepare the AES inputs into v24.
+       vmv.v.v         v24, v16
+       vrev8.v         v24, v24, v0.t  // Convert counters back to big-endian.
+
+       // Encrypt the AES inputs to create the next portion of the keystream.
+       aes_encrypt     v24, \keylen
+
+       // XOR the data with the keystream.
+       vsetvli         t0, LEN, e8, m4, ta, ma
+       vle8.v          v20, (INP)
+       vxor.vv         v20, v20, v24
+       vse8.v          v20, (OUTP)
+
+       // Advance the pointers and update the remaining length.
+       add             INP, INP, t0
+       add             OUTP, OUTP, t0
+       sub             LEN, LEN, t0
+       sub             LEN32, LEN32, VL_E32
+       srli            VL_BLOCKS, VL_E32, 2
+
+       // Repeat if more data remains.
+       bnez            LEN, 1b
+
+       // Update *IVP to contain the next counter.
+       vsetivli        zero, 4, e32, m1, ta, mu
+       vadd.vx         v16, v16, VL_BLOCKS, v0.t
+       vrev8.v         v16, v16, v0.t  // Convert counters back to big-endian.
+       vse32.v         v16, (IVP)
+
+       ret
+.endm
+
+// void aes_ctr32_crypt_zvkned_zvkb(const struct crypto_aes_ctx *key,
+//                                 const u8 *in, u8 *out, size_t len,
+//                                 u8 iv[16]);
+SYM_FUNC_START(aes_ctr32_crypt_zvkned_zvkb)
+       aes_begin       KEYP, 128f, 192f
+       aes_ctr32_crypt 256
+128:
+       aes_ctr32_crypt 128
+192:
+       aes_ctr32_crypt 192
+SYM_FUNC_END(aes_ctr32_crypt_zvkned_zvkb)
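Inside the loop above the counters are kept little-endian and converted with vrev8 only around the AES step; to the caller, the IV simply carries a big-endian 32-bit block counter in its last word, which the function advances by the number of blocks processed. A scalar sketch of that convention (helper name illustrative, assuming the usual unaligned-access helpers):

#include <asm/unaligned.h>
#include <linux/types.h>

/* Scalar view of the CTR32 convention used by aes_ctr32_crypt_zvkned_zvkb:
 * only the last 32-bit word of the 16-byte IV is a big-endian counter, and
 * it wraps modulo 2^32 after nblocks blocks of keystream. */
static void ctr32_advance_ref(u8 iv[16], u32 nblocks)
{
	u32 ctr = get_unaligned_be32(iv + 12);

	put_unaligned_be32(ctr + nblocks, iv + 12);
}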
diff --git a/arch/riscv/crypto/aes-riscv64-zvkned.S b/arch/riscv/crypto/aes-riscv64-zvkned.S
new file mode 100644 (file)
index 0000000..23d063f
--- /dev/null
@@ -0,0 +1,339 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// This file is dual-licensed, meaning that you can use it under your
+// choice of either of the following two licenses:
+//
+// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You can obtain
+// a copy in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// or
+//
+// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
+// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
+// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
+// Copyright 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The generated code of this file depends on the following RISC-V extensions:
+// - RV64I
+// - RISC-V Vector ('V') with VLEN >= 128
+// - RISC-V Vector AES block cipher extension ('Zvkned')
+
+#include <linux/linkage.h>
+
+.text
+.option arch, +zvkned
+
+#include "aes-macros.S"
+
+#define KEYP           a0
+#define INP            a1
+#define OUTP           a2
+#define LEN            a3
+#define IVP            a4
+
+.macro __aes_crypt_zvkned      enc, keylen
+       vle32.v         v16, (INP)
+       aes_crypt       v16, \enc, \keylen
+       vse32.v         v16, (OUTP)
+       ret
+.endm
+
+.macro aes_crypt_zvkned        enc
+       aes_begin       KEYP, 128f, 192f
+       __aes_crypt_zvkned      \enc, 256
+128:
+       __aes_crypt_zvkned      \enc, 128
+192:
+       __aes_crypt_zvkned      \enc, 192
+.endm
+
+// void aes_encrypt_zvkned(const struct crypto_aes_ctx *key,
+//                        const u8 in[16], u8 out[16]);
+SYM_FUNC_START(aes_encrypt_zvkned)
+       aes_crypt_zvkned        1
+SYM_FUNC_END(aes_encrypt_zvkned)
+
+// Same prototype and calling convention as the encryption function
+SYM_FUNC_START(aes_decrypt_zvkned)
+       aes_crypt_zvkned        0
+SYM_FUNC_END(aes_decrypt_zvkned)
+
+.macro __aes_ecb_crypt enc, keylen
+       srli            t0, LEN, 2
+       // t0 is the remaining length in 32-bit words.  It's a multiple of 4.
+1:
+       vsetvli         t1, t0, e32, m8, ta, ma
+       sub             t0, t0, t1      // Subtract number of words processed
+       slli            t1, t1, 2       // Words to bytes
+       vle32.v         v16, (INP)
+       aes_crypt       v16, \enc, \keylen
+       vse32.v         v16, (OUTP)
+       add             INP, INP, t1
+       add             OUTP, OUTP, t1
+       bnez            t0, 1b
+
+       ret
+.endm
+
+.macro aes_ecb_crypt   enc
+       aes_begin       KEYP, 128f, 192f
+       __aes_ecb_crypt \enc, 256
+128:
+       __aes_ecb_crypt \enc, 128
+192:
+       __aes_ecb_crypt \enc, 192
+.endm
+
+// void aes_ecb_encrypt_zvkned(const struct crypto_aes_ctx *key,
+//                            const u8 *in, u8 *out, size_t len);
+//
+// |len| must be nonzero and a multiple of 16 (AES_BLOCK_SIZE).
+SYM_FUNC_START(aes_ecb_encrypt_zvkned)
+       aes_ecb_crypt   1
+SYM_FUNC_END(aes_ecb_encrypt_zvkned)
+
+// Same prototype and calling convention as the encryption function
+SYM_FUNC_START(aes_ecb_decrypt_zvkned)
+       aes_ecb_crypt   0
+SYM_FUNC_END(aes_ecb_decrypt_zvkned)
+
+.macro aes_cbc_encrypt keylen
+       vle32.v         v16, (IVP)      // Load IV
+1:
+       vle32.v         v17, (INP)      // Load plaintext block
+       vxor.vv         v16, v16, v17   // XOR with IV or prev ciphertext block
+       aes_encrypt     v16, \keylen    // Encrypt
+       vse32.v         v16, (OUTP)     // Store ciphertext block
+       addi            INP, INP, 16
+       addi            OUTP, OUTP, 16
+       addi            LEN, LEN, -16
+       bnez            LEN, 1b
+
+       vse32.v         v16, (IVP)      // Store next IV
+       ret
+.endm
+
+.macro aes_cbc_decrypt keylen
+       srli            LEN, LEN, 2     // Convert LEN from bytes to words
+       vle32.v         v16, (IVP)      // Load IV
+1:
+       vsetvli         t0, LEN, e32, m4, ta, ma
+       vle32.v         v20, (INP)      // Load ciphertext blocks
+       vslideup.vi     v16, v20, 4     // Setup prev ciphertext blocks
+       addi            t1, t0, -4
+       vslidedown.vx   v24, v20, t1    // Save last ciphertext block
+       aes_decrypt     v20, \keylen    // Decrypt the blocks
+       vxor.vv         v20, v20, v16   // XOR with prev ciphertext blocks
+       vse32.v         v20, (OUTP)     // Store plaintext blocks
+       vmv.v.v         v16, v24        // Next "IV" is last ciphertext block
+       slli            t1, t0, 2       // Words to bytes
+       add             INP, INP, t1
+       add             OUTP, OUTP, t1
+       sub             LEN, LEN, t0
+       bnez            LEN, 1b
+
+       vsetivli        zero, 4, e32, m1, ta, ma
+       vse32.v         v16, (IVP)      // Store next IV
+       ret
+.endm
+
+// void aes_cbc_encrypt_zvkned(const struct crypto_aes_ctx *key,
+//                            const u8 *in, u8 *out, size_t len, u8 iv[16]);
+//
+// |len| must be nonzero and a multiple of 16 (AES_BLOCK_SIZE).
+SYM_FUNC_START(aes_cbc_encrypt_zvkned)
+       aes_begin       KEYP, 128f, 192f
+       aes_cbc_encrypt 256
+128:
+       aes_cbc_encrypt 128
+192:
+       aes_cbc_encrypt 192
+SYM_FUNC_END(aes_cbc_encrypt_zvkned)
+
+// Same prototype and calling convention as the encryption function
+SYM_FUNC_START(aes_cbc_decrypt_zvkned)
+       aes_begin       KEYP, 128f, 192f
+       aes_cbc_decrypt 256
+128:
+       aes_cbc_decrypt 128
+192:
+       aes_cbc_decrypt 192
+SYM_FUNC_END(aes_cbc_decrypt_zvkned)
+
+.macro aes_cbc_cts_encrypt     keylen
+
+       // CBC-encrypt all blocks except the last.  But don't store the
+       // second-to-last block to the output buffer yet, since it will be
+       // handled specially in the ciphertext stealing step.  Exception: if the
+       // message is single-block, still encrypt the last (and only) block.
+       li              t0, 16
+       j               2f
+1:
+       vse32.v         v16, (OUTP)     // Store ciphertext block
+       addi            OUTP, OUTP, 16
+2:
+       vle32.v         v17, (INP)      // Load plaintext block
+       vxor.vv         v16, v16, v17   // XOR with IV or prev ciphertext block
+       aes_encrypt     v16, \keylen    // Encrypt
+       addi            INP, INP, 16
+       addi            LEN, LEN, -16
+       bgt             LEN, t0, 1b     // Repeat if more than one block remains
+
+       // Special case: if the message is a single block, just do CBC.
+       beqz            LEN, .Lcts_encrypt_done\@
+
+       // Encrypt the last two blocks using ciphertext stealing as follows:
+       //      C[n-1] = Encrypt(Encrypt(P[n-1] ^ C[n-2]) ^ P[n])
+       //      C[n] = Encrypt(P[n-1] ^ C[n-2])[0..LEN]
+       //
+       // C[i] denotes the i'th ciphertext block, and likewise P[i] the i'th
+       // plaintext block.  Block n, the last block, may be partial; its length
+       // is 1 <= LEN <= 16.  If there are only 2 blocks, C[n-2] means the IV.
+       //
+       // v16 already contains Encrypt(P[n-1] ^ C[n-2]).
+       // INP points to P[n].  OUTP points to where C[n-1] should go.
+       // To support in-place encryption, load P[n] before storing C[n].
+       addi            t0, OUTP, 16    // Get pointer to where C[n] should go
+       vsetvli         zero, LEN, e8, m1, tu, ma
+       vle8.v          v17, (INP)      // Load P[n]
+       vse8.v          v16, (t0)       // Store C[n]
+       vxor.vv         v16, v16, v17   // v16 = Encrypt(P[n-1] ^ C[n-2]) ^ P[n]
+       vsetivli        zero, 4, e32, m1, ta, ma
+       aes_encrypt     v16, \keylen
+.Lcts_encrypt_done\@:
+       vse32.v         v16, (OUTP)     // Store C[n-1] (or C[n] in single-block case)
+       ret
+.endm
+
+#define LEN32          t4 // Length of remaining full blocks in 32-bit words
+#define LEN_MOD16      t5 // Length of message in bytes mod 16
+
+.macro aes_cbc_cts_decrypt     keylen
+       andi            LEN32, LEN, ~15
+       srli            LEN32, LEN32, 2
+       andi            LEN_MOD16, LEN, 15
+
+       // Save C[n-2] in v28 so that it's available later during the ciphertext
+       // stealing step.  If there are fewer than three blocks, C[n-2] means
+       // the IV, otherwise it means the third-to-last ciphertext block.
+       vmv.v.v         v28, v16        // IV
+       add             t0, LEN, -33
+       bltz            t0, .Lcts_decrypt_loop\@
+       andi            t0, t0, ~15
+       add             t0, t0, INP
+       vle32.v         v28, (t0)
+
+       // CBC-decrypt all full blocks.  For the last full block, or the last 2
+       // full blocks if the message is block-aligned, this doesn't write the
+       // correct output blocks (unless the message is only a single block),
+       // because it XORs the wrong values with the raw AES plaintexts.  But we
+       // fix this after this loop without redoing the AES decryptions.  This
+       // approach allows more of the AES decryptions to be parallelized.
+.Lcts_decrypt_loop\@:
+       vsetvli         t0, LEN32, e32, m4, ta, ma
+       addi            t1, t0, -4
+       vle32.v         v20, (INP)      // Load next set of ciphertext blocks
+       vmv.v.v         v24, v16        // Get IV or last ciphertext block of prev set
+       vslideup.vi     v24, v20, 4     // Setup prev ciphertext blocks
+       vslidedown.vx   v16, v20, t1    // Save last ciphertext block of this set
+       aes_decrypt     v20, \keylen    // Decrypt this set of blocks
+       vxor.vv         v24, v24, v20   // XOR prev ciphertext blocks with decrypted blocks
+       vse32.v         v24, (OUTP)     // Store this set of plaintext blocks
+       sub             LEN32, LEN32, t0
+       slli            t0, t0, 2       // Words to bytes
+       add             INP, INP, t0
+       add             OUTP, OUTP, t0
+       bnez            LEN32, .Lcts_decrypt_loop\@
+
+       vsetivli        zero, 4, e32, m4, ta, ma
+       vslidedown.vx   v20, v20, t1    // Extract raw plaintext of last full block
+       addi            t0, OUTP, -16   // Get pointer to last full plaintext block
+       bnez            LEN_MOD16, .Lcts_decrypt_non_block_aligned\@
+
+       // Special case: if the message is a single block, just do CBC.
+       li              t1, 16
+       beq             LEN, t1, .Lcts_decrypt_done\@
+
+       // Block-aligned message.  Just fix up the last 2 blocks.  We need:
+       //
+       //      P[n-1] = Decrypt(C[n]) ^ C[n-2]
+       //      P[n] = Decrypt(C[n-1]) ^ C[n]
+       //
+       // We have C[n] in v16, Decrypt(C[n]) in v20, and C[n-2] in v28.
+       // Together with Decrypt(C[n-1]) ^ C[n-2] from the output buffer, this
+       // is everything needed to fix the output without re-decrypting blocks.
+       addi            t1, OUTP, -32   // Get pointer to where P[n-1] should go
+       vxor.vv         v20, v20, v28   // Decrypt(C[n]) ^ C[n-2] == P[n-1]
+       vle32.v         v24, (t1)       // Decrypt(C[n-1]) ^ C[n-2]
+       vse32.v         v20, (t1)       // Store P[n-1]
+       vxor.vv         v20, v24, v16   // Decrypt(C[n-1]) ^ C[n-2] ^ C[n] == P[n] ^ C[n-2]
+       j               .Lcts_decrypt_finish\@
+
+.Lcts_decrypt_non_block_aligned\@:
+       // Decrypt the last two blocks using ciphertext stealing as follows:
+       //
+       //      P[n-1] = Decrypt(C[n] || Decrypt(C[n-1])[LEN_MOD16..16]) ^ C[n-2]
+       //      P[n] = (Decrypt(C[n-1]) ^ C[n])[0..LEN_MOD16]
+       //
+       // We already have Decrypt(C[n-1]) in v20 and C[n-2] in v28.
+       vmv.v.v         v16, v20        // v16 = Decrypt(C[n-1])
+       vsetvli         zero, LEN_MOD16, e8, m1, tu, ma
+       vle8.v          v20, (INP)      // v20 = C[n] || Decrypt(C[n-1])[LEN_MOD16..16]
+       vxor.vv         v16, v16, v20   // v16 = Decrypt(C[n-1]) ^ C[n]
+       vse8.v          v16, (OUTP)     // Store P[n]
+       vsetivli        zero, 4, e32, m1, ta, ma
+       aes_decrypt     v20, \keylen    // v20 = Decrypt(C[n] || Decrypt(C[n-1])[LEN_MOD16..16])
+.Lcts_decrypt_finish\@:
+       vxor.vv         v20, v20, v28   // XOR with C[n-2]
+       vse32.v         v20, (t0)       // Store last full plaintext block
+.Lcts_decrypt_done\@:
+       ret
+.endm
+
+.macro aes_cbc_cts_crypt       keylen
+       vle32.v         v16, (IVP)      // Load IV
+       beqz            a5, .Lcts_decrypt\@
+       aes_cbc_cts_encrypt \keylen
+.Lcts_decrypt\@:
+       aes_cbc_cts_decrypt \keylen
+.endm
+
+// void aes_cbc_cts_crypt_zvkned(const struct crypto_aes_ctx *key,
+//                              const u8 *in, u8 *out, size_t len,
+//                              const u8 iv[16], bool enc);
+//
+// Encrypts or decrypts a message with the CS3 variant of AES-CBC-CTS.
+// This is the variant that unconditionally swaps the last two blocks.
+SYM_FUNC_START(aes_cbc_cts_crypt_zvkned)
+       aes_begin       KEYP, 128f, 192f
+       aes_cbc_cts_crypt 256
+128:
+       aes_cbc_cts_crypt 128
+192:
+       aes_cbc_cts_crypt 192
+SYM_FUNC_END(aes_cbc_cts_crypt_zvkned)
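The trickiest part of this file is the CS3 ciphertext-stealing tail in aes_cbc_cts_encrypt. As a cross-check of the equations in its comments, here is a scalar sketch of the same tail computation using the kernel's generic AES and crypto_xor helpers; the helper name and argument layout are illustrative.

#include <crypto/aes.h>
#include <crypto/algapi.h>
#include <linux/string.h>

/* Scalar sketch of the ciphertext-stealing tail:
 *   e      = Encrypt(P[n-1] ^ C[n-2])
 *   C[n]   = e[0..tail]
 *   C[n-1] = Encrypt(e with its first `tail` bytes XORed with P[n])
 */
static void cbc_cts_encrypt_tail_ref(const struct crypto_aes_ctx *ctx,
				     const u8 *p_last_full,	/* P[n-1] */
				     const u8 *p_tail,		/* P[n], `tail` bytes */
				     unsigned int tail,		/* 1..16 */
				     const u8 *c_prev,		/* C[n-2], or the IV */
				     u8 *c_last_full,		/* C[n-1] out */
				     u8 *c_tail)		/* C[n] out */
{
	u8 e[AES_BLOCK_SIZE];

	crypto_xor_cpy(e, p_last_full, c_prev, AES_BLOCK_SIZE);
	aes_encrypt(ctx, e, e);		/* e = Encrypt(P[n-1] ^ C[n-2]) */
	memcpy(c_tail, e, tail);	/* stolen ciphertext becomes C[n] */
	crypto_xor(e, p_tail, tail);	/* splice P[n] over the stolen bytes */
	aes_encrypt(ctx, c_last_full, e);
}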
diff --git a/arch/riscv/crypto/chacha-riscv64-glue.c b/arch/riscv/crypto/chacha-riscv64-glue.c
new file mode 100644 (file)
index 0000000..10b46f3
--- /dev/null
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ChaCha20 using the RISC-V vector crypto extensions
+ *
+ * Copyright (C) 2023 SiFive, Inc.
+ * Author: Jerry Shih <jerry.shih@sifive.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+#include <crypto/internal/chacha.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/linkage.h>
+#include <linux/module.h>
+
+asmlinkage void chacha20_zvkb(const u32 key[8], const u8 *in, u8 *out,
+                             size_t len, const u32 iv[4]);
+
+static int riscv64_chacha20_crypt(struct skcipher_request *req)
+{
+       u32 iv[CHACHA_IV_SIZE / sizeof(u32)];
+       u8 block_buffer[CHACHA_BLOCK_SIZE];
+       struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+       const struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+       struct skcipher_walk walk;
+       unsigned int nbytes;
+       unsigned int tail_bytes;
+       int err;
+
+       iv[0] = get_unaligned_le32(req->iv);
+       iv[1] = get_unaligned_le32(req->iv + 4);
+       iv[2] = get_unaligned_le32(req->iv + 8);
+       iv[3] = get_unaligned_le32(req->iv + 12);
+
+       err = skcipher_walk_virt(&walk, req, false);
+       while (walk.nbytes) {
+               nbytes = walk.nbytes & ~(CHACHA_BLOCK_SIZE - 1);
+               tail_bytes = walk.nbytes & (CHACHA_BLOCK_SIZE - 1);
+               kernel_vector_begin();
+               if (nbytes) {
+                       chacha20_zvkb(ctx->key, walk.src.virt.addr,
+                                     walk.dst.virt.addr, nbytes, iv);
+                       iv[0] += nbytes / CHACHA_BLOCK_SIZE;
+               }
+               if (walk.nbytes == walk.total && tail_bytes > 0) {
+                       memcpy(block_buffer, walk.src.virt.addr + nbytes,
+                              tail_bytes);
+                       chacha20_zvkb(ctx->key, block_buffer, block_buffer,
+                                     CHACHA_BLOCK_SIZE, iv);
+                       memcpy(walk.dst.virt.addr + nbytes, block_buffer,
+                              tail_bytes);
+                       tail_bytes = 0;
+               }
+               kernel_vector_end();
+
+               err = skcipher_walk_done(&walk, tail_bytes);
+       }
+
+       return err;
+}
+
+static struct skcipher_alg riscv64_chacha_alg = {
+       .setkey = chacha20_setkey,
+       .encrypt = riscv64_chacha20_crypt,
+       .decrypt = riscv64_chacha20_crypt,
+       .min_keysize = CHACHA_KEY_SIZE,
+       .max_keysize = CHACHA_KEY_SIZE,
+       .ivsize = CHACHA_IV_SIZE,
+       .chunksize = CHACHA_BLOCK_SIZE,
+       .walksize = 4 * CHACHA_BLOCK_SIZE,
+       .base = {
+               .cra_blocksize = 1,
+               .cra_ctxsize = sizeof(struct chacha_ctx),
+               .cra_priority = 300,
+               .cra_name = "chacha20",
+               .cra_driver_name = "chacha20-riscv64-zvkb",
+               .cra_module = THIS_MODULE,
+       },
+};
+
+static int __init riscv64_chacha_mod_init(void)
+{
+       if (riscv_isa_extension_available(NULL, ZVKB) &&
+           riscv_vector_vlen() >= 128)
+               return crypto_register_skcipher(&riscv64_chacha_alg);
+
+       return -ENODEV;
+}
+
+static void __exit riscv64_chacha_mod_exit(void)
+{
+       crypto_unregister_skcipher(&riscv64_chacha_alg);
+}
+
+module_init(riscv64_chacha_mod_init);
+module_exit(riscv64_chacha_mod_exit);
+
+MODULE_DESCRIPTION("ChaCha20 (RISC-V accelerated)");
+MODULE_AUTHOR("Jerry Shih <jerry.shih@sifive.com>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("chacha20");
diff --git a/arch/riscv/crypto/chacha-riscv64-zvkb.S b/arch/riscv/crypto/chacha-riscv64-zvkb.S
new file mode 100644 (file)
index 0000000..bf05773
--- /dev/null
@@ -0,0 +1,294 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// This file is dual-licensed, meaning that you can use it under your
+// choice of either of the following two licenses:
+//
+// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You can obtain
+// a copy in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// or
+//
+// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
+// Copyright 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The generated code of this file depends on the following RISC-V extensions:
+// - RV64I
+// - RISC-V Vector ('V') with VLEN >= 128
+// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
+
+#include <linux/linkage.h>
+
+.text
+.option arch, +zvkb
+
+#define KEYP           a0
+#define INP            a1
+#define OUTP           a2
+#define LEN            a3
+#define IVP            a4
+
+#define CONSTS0                a5
+#define CONSTS1                a6
+#define CONSTS2                a7
+#define CONSTS3                t0
+#define TMP            t1
+#define VL             t2
+#define STRIDE         t3
+#define NROUNDS                t4
+#define KEY0           s0
+#define KEY1           s1
+#define KEY2           s2
+#define KEY3           s3
+#define KEY4           s4
+#define KEY5           s5
+#define KEY6           s6
+#define KEY7           s7
+#define COUNTER                s8
+#define NONCE0         s9
+#define NONCE1         s10
+#define NONCE2         s11
+
+.macro chacha_round    a0, b0, c0, d0,  a1, b1, c1, d1, \
+                       a2, b2, c2, d2,  a3, b3, c3, d3
+       // a += b; d ^= a; d = rol(d, 16);
+       vadd.vv         \a0, \a0, \b0
+       vadd.vv         \a1, \a1, \b1
+       vadd.vv         \a2, \a2, \b2
+       vadd.vv         \a3, \a3, \b3
+       vxor.vv         \d0, \d0, \a0
+       vxor.vv         \d1, \d1, \a1
+       vxor.vv         \d2, \d2, \a2
+       vxor.vv         \d3, \d3, \a3
+       vror.vi         \d0, \d0, 32 - 16
+       vror.vi         \d1, \d1, 32 - 16
+       vror.vi         \d2, \d2, 32 - 16
+       vror.vi         \d3, \d3, 32 - 16
+
+       // c += d; b ^= c; b = rol(b, 12);
+       vadd.vv         \c0, \c0, \d0
+       vadd.vv         \c1, \c1, \d1
+       vadd.vv         \c2, \c2, \d2
+       vadd.vv         \c3, \c3, \d3
+       vxor.vv         \b0, \b0, \c0
+       vxor.vv         \b1, \b1, \c1
+       vxor.vv         \b2, \b2, \c2
+       vxor.vv         \b3, \b3, \c3
+       vror.vi         \b0, \b0, 32 - 12
+       vror.vi         \b1, \b1, 32 - 12
+       vror.vi         \b2, \b2, 32 - 12
+       vror.vi         \b3, \b3, 32 - 12
+
+       // a += b; d ^= a; d = rol(d, 8);
+       vadd.vv         \a0, \a0, \b0
+       vadd.vv         \a1, \a1, \b1
+       vadd.vv         \a2, \a2, \b2
+       vadd.vv         \a3, \a3, \b3
+       vxor.vv         \d0, \d0, \a0
+       vxor.vv         \d1, \d1, \a1
+       vxor.vv         \d2, \d2, \a2
+       vxor.vv         \d3, \d3, \a3
+       vror.vi         \d0, \d0, 32 - 8
+       vror.vi         \d1, \d1, 32 - 8
+       vror.vi         \d2, \d2, 32 - 8
+       vror.vi         \d3, \d3, 32 - 8
+
+       // c += d; b ^= c; b = rol(b, 7);
+       vadd.vv         \c0, \c0, \d0
+       vadd.vv         \c1, \c1, \d1
+       vadd.vv         \c2, \c2, \d2
+       vadd.vv         \c3, \c3, \d3
+       vxor.vv         \b0, \b0, \c0
+       vxor.vv         \b1, \b1, \c1
+       vxor.vv         \b2, \b2, \c2
+       vxor.vv         \b3, \b3, \c3
+       vror.vi         \b0, \b0, 32 - 7
+       vror.vi         \b1, \b1, 32 - 7
+       vror.vi         \b2, \b2, 32 - 7
+       vror.vi         \b3, \b3, 32 - 7
+.endm
+
+// void chacha20_zvkb(const u32 key[8], const u8 *in, u8 *out, size_t len,
+//                   const u32 iv[4]);
+//
+// |len| must be nonzero and a multiple of 64 (CHACHA_BLOCK_SIZE).
+// The counter is treated as 32-bit, following the RFC7539 convention.
+SYM_FUNC_START(chacha20_zvkb)
+       srli            LEN, LEN, 6     // Bytes to blocks
+
+       addi            sp, sp, -96
+       sd              s0, 0(sp)
+       sd              s1, 8(sp)
+       sd              s2, 16(sp)
+       sd              s3, 24(sp)
+       sd              s4, 32(sp)
+       sd              s5, 40(sp)
+       sd              s6, 48(sp)
+       sd              s7, 56(sp)
+       sd              s8, 64(sp)
+       sd              s9, 72(sp)
+       sd              s10, 80(sp)
+       sd              s11, 88(sp)
+
+       li              STRIDE, 64
+
+       // Set up the initial state matrix in scalar registers.
+       li              CONSTS0, 0x61707865     // "expa" little endian
+       li              CONSTS1, 0x3320646e     // "nd 3" little endian
+       li              CONSTS2, 0x79622d32     // "2-by" little endian
+       li              CONSTS3, 0x6b206574     // "te k" little endian
+       lw              KEY0, 0(KEYP)
+       lw              KEY1, 4(KEYP)
+       lw              KEY2, 8(KEYP)
+       lw              KEY3, 12(KEYP)
+       lw              KEY4, 16(KEYP)
+       lw              KEY5, 20(KEYP)
+       lw              KEY6, 24(KEYP)
+       lw              KEY7, 28(KEYP)
+       lw              COUNTER, 0(IVP)
+       lw              NONCE0, 4(IVP)
+       lw              NONCE1, 8(IVP)
+       lw              NONCE2, 12(IVP)
+
+.Lblock_loop:
+       // Set vl to the number of blocks to process in this iteration.
+       vsetvli         VL, LEN, e32, m1, ta, ma
+
+       // Set up the initial state matrix for the next VL blocks in v0-v15.
+       // v{i} holds the i'th 32-bit word of the state matrix for all blocks.
+       // Note that only the counter word, at index 12, differs across blocks.
+       vmv.v.x         v0, CONSTS0
+       vmv.v.x         v1, CONSTS1
+       vmv.v.x         v2, CONSTS2
+       vmv.v.x         v3, CONSTS3
+       vmv.v.x         v4, KEY0
+       vmv.v.x         v5, KEY1
+       vmv.v.x         v6, KEY2
+       vmv.v.x         v7, KEY3
+       vmv.v.x         v8, KEY4
+       vmv.v.x         v9, KEY5
+       vmv.v.x         v10, KEY6
+       vmv.v.x         v11, KEY7
+       vid.v           v12
+       vadd.vx         v12, v12, COUNTER
+       vmv.v.x         v13, NONCE0
+       vmv.v.x         v14, NONCE1
+       vmv.v.x         v15, NONCE2
+
+       // Load the first half of the input data for each block into v16-v23.
+       // v{16+i} holds the i'th 32-bit word for all blocks.
+       vlsseg8e32.v    v16, (INP), STRIDE
+
+       li              NROUNDS, 20
+.Lnext_doubleround:
+       addi            NROUNDS, NROUNDS, -2
+       // column round
+       chacha_round    v0, v4, v8, v12, v1, v5, v9, v13, \
+                       v2, v6, v10, v14, v3, v7, v11, v15
+       // diagonal round
+       chacha_round    v0, v5, v10, v15, v1, v6, v11, v12, \
+                       v2, v7, v8, v13, v3, v4, v9, v14
+       bnez            NROUNDS, .Lnext_doubleround
+
+       // Load the second half of the input data for each block into v24-v31.
+       // v{24+i} holds the {8+i}'th 32-bit word for all blocks.
+       addi            TMP, INP, 32
+       vlsseg8e32.v    v24, (TMP), STRIDE
+
+       // Finalize the first half of the keystream for each block.
+       vadd.vx         v0, v0, CONSTS0
+       vadd.vx         v1, v1, CONSTS1
+       vadd.vx         v2, v2, CONSTS2
+       vadd.vx         v3, v3, CONSTS3
+       vadd.vx         v4, v4, KEY0
+       vadd.vx         v5, v5, KEY1
+       vadd.vx         v6, v6, KEY2
+       vadd.vx         v7, v7, KEY3
+
+       // Encrypt/decrypt the first half of the data for each block.
+       vxor.vv         v16, v16, v0
+       vxor.vv         v17, v17, v1
+       vxor.vv         v18, v18, v2
+       vxor.vv         v19, v19, v3
+       vxor.vv         v20, v20, v4
+       vxor.vv         v21, v21, v5
+       vxor.vv         v22, v22, v6
+       vxor.vv         v23, v23, v7
+
+       // Store the first half of the output data for each block.
+       vssseg8e32.v    v16, (OUTP), STRIDE
+
+       // Finalize the second half of the keystream for each block.
+       vadd.vx         v8, v8, KEY4
+       vadd.vx         v9, v9, KEY5
+       vadd.vx         v10, v10, KEY6
+       vadd.vx         v11, v11, KEY7
+       vid.v           v0
+       vadd.vx         v12, v12, COUNTER
+       vadd.vx         v13, v13, NONCE0
+       vadd.vx         v14, v14, NONCE1
+       vadd.vx         v15, v15, NONCE2
+       vadd.vv         v12, v12, v0
+
+       // Encrypt/decrypt the second half of the data for each block.
+       vxor.vv         v24, v24, v8
+       vxor.vv         v25, v25, v9
+       vxor.vv         v26, v26, v10
+       vxor.vv         v27, v27, v11
+       vxor.vv         v28, v28, v12
+       vxor.vv         v29, v29, v13
+       vxor.vv         v30, v30, v14
+       vxor.vv         v31, v31, v15
+
+       // Store the second half of the output data for each block.
+       addi            TMP, OUTP, 32
+       vssseg8e32.v    v24, (TMP), STRIDE
+
+       // Update the counter, the remaining number of blocks, and the input and
+       // output pointers according to the number of blocks processed (VL).
+       add             COUNTER, COUNTER, VL
+       sub             LEN, LEN, VL
+       slli            TMP, VL, 6
+       add             OUTP, OUTP, TMP
+       add             INP, INP, TMP
+       bnez            LEN, .Lblock_loop
+
+       ld              s0, 0(sp)
+       ld              s1, 8(sp)
+       ld              s2, 16(sp)
+       ld              s3, 24(sp)
+       ld              s4, 32(sp)
+       ld              s5, 40(sp)
+       ld              s6, 48(sp)
+       ld              s7, 56(sp)
+       ld              s8, 64(sp)
+       ld              s9, 72(sp)
+       ld              s10, 80(sp)
+       ld              s11, 88(sp)
+       addi            sp, sp, 96
+       ret
+SYM_FUNC_END(chacha20_zvkb)
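The chacha_round macro vectorizes four independent quarter-rounds across all blocks at once; its comments spell out the scalar operation being implemented. For reference, a minimal scalar quarter-round matching those comments (the helper is illustrative, not part of the patch):

#include <linux/bitops.h>	/* rol32() */
#include <linux/types.h>

/* Scalar ChaCha quarter-round; chacha_round applies this to four
 * (a, b, c, d) column/diagonal groups at once, across all blocks held
 * in the vector registers. */
static inline void chacha_qr_ref(u32 *a, u32 *b, u32 *c, u32 *d)
{
	*a += *b; *d ^= *a; *d = rol32(*d, 16);
	*c += *d; *b ^= *c; *b = rol32(*b, 12);
	*a += *b; *d ^= *a; *d = rol32(*d, 8);
	*c += *d; *b ^= *c; *b = rol32(*b, 7);
}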
diff --git a/arch/riscv/crypto/ghash-riscv64-glue.c b/arch/riscv/crypto/ghash-riscv64-glue.c
new file mode 100644 (file)
index 0000000..312e789
--- /dev/null
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * GHASH using the RISC-V vector crypto extensions
+ *
+ * Copyright (C) 2023 VRULL GmbH
+ * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
+ *
+ * Copyright (C) 2023 SiFive, Inc.
+ * Author: Jerry Shih <jerry.shih@sifive.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+#include <crypto/ghash.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <linux/linkage.h>
+#include <linux/module.h>
+
+asmlinkage void ghash_zvkg(be128 *accumulator, const be128 *key, const u8 *data,
+                          size_t len);
+
+struct riscv64_ghash_tfm_ctx {
+       be128 key;
+};
+
+struct riscv64_ghash_desc_ctx {
+       be128 accumulator;
+       u8 buffer[GHASH_BLOCK_SIZE];
+       u32 bytes;
+};
+
+static int riscv64_ghash_setkey(struct crypto_shash *tfm, const u8 *key,
+                               unsigned int keylen)
+{
+       struct riscv64_ghash_tfm_ctx *tctx = crypto_shash_ctx(tfm);
+
+       if (keylen != GHASH_BLOCK_SIZE)
+               return -EINVAL;
+
+       memcpy(&tctx->key, key, GHASH_BLOCK_SIZE);
+
+       return 0;
+}
+
+static int riscv64_ghash_init(struct shash_desc *desc)
+{
+       struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+
+       *dctx = (struct riscv64_ghash_desc_ctx){};
+
+       return 0;
+}
+
+static inline void
+riscv64_ghash_blocks(const struct riscv64_ghash_tfm_ctx *tctx,
+                    struct riscv64_ghash_desc_ctx *dctx,
+                    const u8 *src, size_t srclen)
+{
+       /* The srclen is nonzero and a multiple of 16. */
+       if (crypto_simd_usable()) {
+               kernel_vector_begin();
+               ghash_zvkg(&dctx->accumulator, &tctx->key, src, srclen);
+               kernel_vector_end();
+       } else {
+               do {
+                       crypto_xor((u8 *)&dctx->accumulator, src,
+                                  GHASH_BLOCK_SIZE);
+                       gf128mul_lle(&dctx->accumulator, &tctx->key);
+                       src += GHASH_BLOCK_SIZE;
+                       srclen -= GHASH_BLOCK_SIZE;
+               } while (srclen);
+       }
+}
+
+static int riscv64_ghash_update(struct shash_desc *desc, const u8 *src,
+                               unsigned int srclen)
+{
+       const struct riscv64_ghash_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+       struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+       unsigned int len;
+
+       if (dctx->bytes) {
+               if (dctx->bytes + srclen < GHASH_BLOCK_SIZE) {
+                       memcpy(dctx->buffer + dctx->bytes, src, srclen);
+                       dctx->bytes += srclen;
+                       return 0;
+               }
+               memcpy(dctx->buffer + dctx->bytes, src,
+                      GHASH_BLOCK_SIZE - dctx->bytes);
+               riscv64_ghash_blocks(tctx, dctx, dctx->buffer,
+                                    GHASH_BLOCK_SIZE);
+               src += GHASH_BLOCK_SIZE - dctx->bytes;
+               srclen -= GHASH_BLOCK_SIZE - dctx->bytes;
+               dctx->bytes = 0;
+       }
+
+       len = round_down(srclen, GHASH_BLOCK_SIZE);
+       if (len) {
+               riscv64_ghash_blocks(tctx, dctx, src, len);
+               src += len;
+               srclen -= len;
+       }
+
+       if (srclen) {
+               memcpy(dctx->buffer, src, srclen);
+               dctx->bytes = srclen;
+       }
+
+       return 0;
+}
+
+static int riscv64_ghash_final(struct shash_desc *desc, u8 *out)
+{
+       const struct riscv64_ghash_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm);
+       struct riscv64_ghash_desc_ctx *dctx = shash_desc_ctx(desc);
+       int i;
+
+       if (dctx->bytes) {
+               for (i = dctx->bytes; i < GHASH_BLOCK_SIZE; i++)
+                       dctx->buffer[i] = 0;
+
+               riscv64_ghash_blocks(tctx, dctx, dctx->buffer,
+                                    GHASH_BLOCK_SIZE);
+       }
+
+       memcpy(out, &dctx->accumulator, GHASH_DIGEST_SIZE);
+       return 0;
+}
+
+static struct shash_alg riscv64_ghash_alg = {
+       .init = riscv64_ghash_init,
+       .update = riscv64_ghash_update,
+       .final = riscv64_ghash_final,
+       .setkey = riscv64_ghash_setkey,
+       .descsize = sizeof(struct riscv64_ghash_desc_ctx),
+       .digestsize = GHASH_DIGEST_SIZE,
+       .base = {
+               .cra_blocksize = GHASH_BLOCK_SIZE,
+               .cra_ctxsize = sizeof(struct riscv64_ghash_tfm_ctx),
+               .cra_priority = 300,
+               .cra_name = "ghash",
+               .cra_driver_name = "ghash-riscv64-zvkg",
+               .cra_module = THIS_MODULE,
+       },
+};
+
+static int __init riscv64_ghash_mod_init(void)
+{
+       if (riscv_isa_extension_available(NULL, ZVKG) &&
+           riscv_vector_vlen() >= 128)
+               return crypto_register_shash(&riscv64_ghash_alg);
+
+       return -ENODEV;
+}
+
+static void __exit riscv64_ghash_mod_exit(void)
+{
+       crypto_unregister_shash(&riscv64_ghash_alg);
+}
+
+module_init(riscv64_ghash_mod_init);
+module_exit(riscv64_ghash_mod_exit);
+
+MODULE_DESCRIPTION("GHASH (RISC-V accelerated)");
+MODULE_AUTHOR("Heiko Stuebner <heiko.stuebner@vrull.eu>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("ghash");
diff --git a/arch/riscv/crypto/ghash-riscv64-zvkg.S b/arch/riscv/crypto/ghash-riscv64-zvkg.S
new file mode 100644 (file)
index 0000000..f2b43fb
--- /dev/null
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// This file is dual-licensed, meaning that you can use it under your
+// choice of either of the following two licenses:
+//
+// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You can obtain
+// a copy in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// or
+//
+// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
+// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
+// Copyright 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The generated code of this file depends on the following RISC-V extensions:
+// - RV64I
+// - RISC-V Vector ('V') with VLEN >= 128
+// - RISC-V Vector GCM/GMAC extension ('Zvkg')
+
+#include <linux/linkage.h>
+
+.text
+.option arch, +zvkg
+
+#define ACCUMULATOR    a0
+#define KEY            a1
+#define DATA           a2
+#define LEN            a3
+
+// void ghash_zvkg(be128 *accumulator, const be128 *key, const u8 *data,
+//                size_t len);
+//
+// |len| must be nonzero and a multiple of 16 (GHASH_BLOCK_SIZE).
+SYM_FUNC_START(ghash_zvkg)
+       vsetivli        zero, 4, e32, m1, ta, ma
+       vle32.v         v1, (ACCUMULATOR)
+       vle32.v         v2, (KEY)
+.Lnext_block:
+       vle32.v         v3, (DATA)
+       vghsh.vv        v1, v2, v3
+       addi            DATA, DATA, 16
+       addi            LEN, LEN, -16
+       bnez            LEN, .Lnext_block
+
+       vse32.v         v1, (ACCUMULATOR)
+       ret
+SYM_FUNC_END(ghash_zvkg)
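
Per block, the loop above performs one GHASH step: fold the 16-byte block into the accumulator, then multiply by the hash subkey H in GF(2^128), i.e. Y = (Y xor X) * H. As an illustration (not part of the patch), here is a plain-C model of the same loop; ghash_reference(), gf128_mul() and struct u128be are made-up names, and the multiply is the straightforward bitwise reference from the GCM specification, not an optimized one.

#include <stddef.h>
#include <stdint.h>

struct u128be {         /* 128-bit big-endian value: hi = first 8 bytes */
        uint64_t hi, lo;
};

static struct u128be gf128_mul(struct u128be x, struct u128be y)
{
        struct u128be z = { 0, 0 };
        struct u128be v = y;
        int i;

        for (i = 0; i < 128; i++) {
                /* Bit i of x, counting from the most significant bit. */
                uint64_t bit = i < 64 ? (x.hi >> (63 - i)) & 1
                                      : (x.lo >> (127 - i)) & 1;
                uint64_t lsb = v.lo & 1;

                if (bit) {
                        z.hi ^= v.hi;
                        z.lo ^= v.lo;
                }
                /* V >>= 1, xoring in R = 0xE1 || 0^120 if a bit fell off. */
                v.lo = (v.lo >> 1) | (v.hi << 63);
                v.hi >>= 1;
                if (lsb)
                        v.hi ^= 0xe100000000000000ULL;
        }
        return z;
}

void ghash_reference(struct u128be *acc, const struct u128be *key,
                     const struct u128be *data, size_t nblocks)
{
        while (nblocks--) {
                acc->hi ^= data->hi;            /* fold the block in ... */
                acc->lo ^= data->lo;
                *acc = gf128_mul(*acc, *key);   /* ... then multiply by H */
                data++;
        }
}
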
diff --git a/arch/riscv/crypto/sha256-riscv64-glue.c b/arch/riscv/crypto/sha256-riscv64-glue.c
new file mode 100644 (file)
index 0000000..71e051e
--- /dev/null
@@ -0,0 +1,137 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-256 and SHA-224 using the RISC-V vector crypto extensions
+ *
+ * Copyright (C) 2022 VRULL GmbH
+ * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
+ *
+ * Copyright (C) 2023 SiFive, Inc.
+ * Author: Jerry Shih <jerry.shih@sifive.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/sha256_base.h>
+#include <linux/linkage.h>
+#include <linux/module.h>
+
+/*
+ * Note: the asm function only uses the 'state' field of struct sha256_state.
+ * It is assumed to be the first field.
+ */
+asmlinkage void sha256_transform_zvknha_or_zvknhb_zvkb(
+       struct sha256_state *state, const u8 *data, int num_blocks);
+
+static int riscv64_sha256_update(struct shash_desc *desc, const u8 *data,
+                                unsigned int len)
+{
+       /*
+        * Ensure struct sha256_state begins directly with the SHA-256
+        * 256-bit internal state, as this is what the asm function expects.
+        */
+       BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);
+
+       if (crypto_simd_usable()) {
+               kernel_vector_begin();
+               sha256_base_do_update(desc, data, len,
+                                     sha256_transform_zvknha_or_zvknhb_zvkb);
+               kernel_vector_end();
+       } else {
+               crypto_sha256_update(desc, data, len);
+       }
+       return 0;
+}
+
+static int riscv64_sha256_finup(struct shash_desc *desc, const u8 *data,
+                               unsigned int len, u8 *out)
+{
+       if (crypto_simd_usable()) {
+               kernel_vector_begin();
+               if (len)
+                       sha256_base_do_update(
+                               desc, data, len,
+                               sha256_transform_zvknha_or_zvknhb_zvkb);
+               sha256_base_do_finalize(
+                       desc, sha256_transform_zvknha_or_zvknhb_zvkb);
+               kernel_vector_end();
+
+               return sha256_base_finish(desc, out);
+       }
+
+       return crypto_sha256_finup(desc, data, len, out);
+}
+
+static int riscv64_sha256_final(struct shash_desc *desc, u8 *out)
+{
+       return riscv64_sha256_finup(desc, NULL, 0, out);
+}
+
+static int riscv64_sha256_digest(struct shash_desc *desc, const u8 *data,
+                                unsigned int len, u8 *out)
+{
+       return sha256_base_init(desc) ?:
+              riscv64_sha256_finup(desc, data, len, out);
+}
+
+static struct shash_alg riscv64_sha256_algs[] = {
+       {
+               .init = sha256_base_init,
+               .update = riscv64_sha256_update,
+               .final = riscv64_sha256_final,
+               .finup = riscv64_sha256_finup,
+               .digest = riscv64_sha256_digest,
+               .descsize = sizeof(struct sha256_state),
+               .digestsize = SHA256_DIGEST_SIZE,
+               .base = {
+                       .cra_blocksize = SHA256_BLOCK_SIZE,
+                       .cra_priority = 300,
+                       .cra_name = "sha256",
+                       .cra_driver_name = "sha256-riscv64-zvknha_or_zvknhb-zvkb",
+                       .cra_module = THIS_MODULE,
+               },
+       }, {
+               .init = sha224_base_init,
+               .update = riscv64_sha256_update,
+               .final = riscv64_sha256_final,
+               .finup = riscv64_sha256_finup,
+               .descsize = sizeof(struct sha256_state),
+               .digestsize = SHA224_DIGEST_SIZE,
+               .base = {
+                       .cra_blocksize = SHA224_BLOCK_SIZE,
+                       .cra_priority = 300,
+                       .cra_name = "sha224",
+                       .cra_driver_name = "sha224-riscv64-zvknha_or_zvknhb-zvkb",
+                       .cra_module = THIS_MODULE,
+               },
+       },
+};
+
+static int __init riscv64_sha256_mod_init(void)
+{
+       /* Both zvknha and zvknhb provide the SHA-256 instructions. */
+       if ((riscv_isa_extension_available(NULL, ZVKNHA) ||
+            riscv_isa_extension_available(NULL, ZVKNHB)) &&
+           riscv_isa_extension_available(NULL, ZVKB) &&
+           riscv_vector_vlen() >= 128)
+               return crypto_register_shashes(riscv64_sha256_algs,
+                                              ARRAY_SIZE(riscv64_sha256_algs));
+
+       return -ENODEV;
+}
+
+static void __exit riscv64_sha256_mod_exit(void)
+{
+       crypto_unregister_shashes(riscv64_sha256_algs,
+                                 ARRAY_SIZE(riscv64_sha256_algs));
+}
+
+module_init(riscv64_sha256_mod_init);
+module_exit(riscv64_sha256_mod_exit);
+
+MODULE_DESCRIPTION("SHA-256 (RISC-V accelerated)");
+MODULE_AUTHOR("Heiko Stuebner <heiko.stuebner@vrull.eu>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("sha256");
+MODULE_ALIAS_CRYPTO("sha224");
diff --git a/arch/riscv/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S b/arch/riscv/crypto/sha256-riscv64-zvknha_or_zvknhb-zvkb.S
new file mode 100644 (file)
index 0000000..8ebcc17
--- /dev/null
@@ -0,0 +1,225 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// This file is dual-licensed, meaning that you can use it under your
+// choice of either of the following two licenses:
+//
+// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You can obtain
+// a copy in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// or
+//
+// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
+// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
+// Copyright 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The generated code of this file depends on the following RISC-V extensions:
+// - RV64I
+// - RISC-V Vector ('V') with VLEN >= 128
+// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknha' or 'Zvknhb')
+// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
+
+#include <linux/cfi_types.h>
+
+.text
+.option arch, +zvknha, +zvkb
+
+#define STATEP         a0
+#define DATA           a1
+#define NUM_BLOCKS     a2
+
+#define STATEP_C       a3
+
+#define MASK           v0
+#define INDICES                v1
+#define W0             v2
+#define W1             v3
+#define W2             v4
+#define W3             v5
+#define VTMP           v6
+#define FEBA           v7
+#define HGDC           v8
+#define K0             v10
+#define K1             v11
+#define K2             v12
+#define K3             v13
+#define K4             v14
+#define K5             v15
+#define K6             v16
+#define K7             v17
+#define K8             v18
+#define K9             v19
+#define K10            v20
+#define K11            v21
+#define K12            v22
+#define K13            v23
+#define K14            v24
+#define K15            v25
+#define PREV_FEBA      v26
+#define PREV_HGDC      v27
+
+// Do 4 rounds of SHA-256.  w0 contains the current 4 message schedule words.
+//
+// If not all the message schedule words have been computed yet, then this also
+// computes 4 more message schedule words.  w1-w3 contain the next 3 groups of 4
+// message schedule words; this macro computes the group after w3 and writes it
+// to w0.  This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3,
+// w0), so the caller must cycle through the registers accordingly.
+.macro sha256_4rounds  last, k, w0, w1, w2, w3
+       vadd.vv         VTMP, \k, \w0
+       vsha2cl.vv      HGDC, FEBA, VTMP
+       vsha2ch.vv      FEBA, HGDC, VTMP
+.if !\last
+       vmerge.vvm      VTMP, \w2, \w1, MASK
+       vsha2ms.vv      \w0, VTMP, \w3
+.endif
+.endm
+
+.macro sha256_16rounds last, k0, k1, k2, k3
+       sha256_4rounds  \last, \k0, W0, W1, W2, W3
+       sha256_4rounds  \last, \k1, W1, W2, W3, W0
+       sha256_4rounds  \last, \k2, W2, W3, W0, W1
+       sha256_4rounds  \last, \k3, W3, W0, W1, W2
+.endm
+
+// void sha256_transform_zvknha_or_zvknhb_zvkb(u32 state[8], const u8 *data,
+//                                            int num_blocks);
+SYM_TYPED_FUNC_START(sha256_transform_zvknha_or_zvknhb_zvkb)
+
+       // Load the round constants into K0-K15.
+       vsetivli        zero, 4, e32, m1, ta, ma
+       la              t0, K256
+       vle32.v         K0, (t0)
+       addi            t0, t0, 16
+       vle32.v         K1, (t0)
+       addi            t0, t0, 16
+       vle32.v         K2, (t0)
+       addi            t0, t0, 16
+       vle32.v         K3, (t0)
+       addi            t0, t0, 16
+       vle32.v         K4, (t0)
+       addi            t0, t0, 16
+       vle32.v         K5, (t0)
+       addi            t0, t0, 16
+       vle32.v         K6, (t0)
+       addi            t0, t0, 16
+       vle32.v         K7, (t0)
+       addi            t0, t0, 16
+       vle32.v         K8, (t0)
+       addi            t0, t0, 16
+       vle32.v         K9, (t0)
+       addi            t0, t0, 16
+       vle32.v         K10, (t0)
+       addi            t0, t0, 16
+       vle32.v         K11, (t0)
+       addi            t0, t0, 16
+       vle32.v         K12, (t0)
+       addi            t0, t0, 16
+       vle32.v         K13, (t0)
+       addi            t0, t0, 16
+       vle32.v         K14, (t0)
+       addi            t0, t0, 16
+       vle32.v         K15, (t0)
+
+       // Set up the mask for the vmerge to replace the first word (idx==0) in
+       // message scheduling.  There are 4 words, so an 8-bit mask suffices.
+       vsetivli        zero, 1, e8, m1, ta, ma
+       vmv.v.i         MASK, 0x01
+
+       // Load the state.  The state is stored as {a,b,c,d,e,f,g,h}, but we
+       // need {f,e,b,a},{h,g,d,c}.  The dst vtype is e32m1 and the index vtype
+       // is e8mf4.  We use index-load with the i8 indices {20, 16, 4, 0},
+       // loaded using the 32-bit little endian value 0x00041014.
+       li              t0, 0x00041014
+       vsetivli        zero, 1, e32, m1, ta, ma
+       vmv.v.x         INDICES, t0
+       addi            STATEP_C, STATEP, 8
+       vsetivli        zero, 4, e32, m1, ta, ma
+       vluxei8.v       FEBA, (STATEP), INDICES
+       vluxei8.v       HGDC, (STATEP_C), INDICES
+
+.Lnext_block:
+       addi            NUM_BLOCKS, NUM_BLOCKS, -1
+
+       // Save the previous state, as it's needed later.
+       vmv.v.v         PREV_FEBA, FEBA
+       vmv.v.v         PREV_HGDC, HGDC
+
+       // Load the next 512-bit message block and endian-swap each 32-bit word.
+       vle32.v         W0, (DATA)
+       vrev8.v         W0, W0
+       addi            DATA, DATA, 16
+       vle32.v         W1, (DATA)
+       vrev8.v         W1, W1
+       addi            DATA, DATA, 16
+       vle32.v         W2, (DATA)
+       vrev8.v         W2, W2
+       addi            DATA, DATA, 16
+       vle32.v         W3, (DATA)
+       vrev8.v         W3, W3
+       addi            DATA, DATA, 16
+
+       // Do the 64 rounds of SHA-256.
+       sha256_16rounds 0, K0, K1, K2, K3
+       sha256_16rounds 0, K4, K5, K6, K7
+       sha256_16rounds 0, K8, K9, K10, K11
+       sha256_16rounds 1, K12, K13, K14, K15
+
+       // Add the previous state.
+       vadd.vv         FEBA, FEBA, PREV_FEBA
+       vadd.vv         HGDC, HGDC, PREV_HGDC
+
+       // Repeat if more blocks remain.
+       bnez            NUM_BLOCKS, .Lnext_block
+
+       // Store the new state and return.
+       vsuxei8.v       FEBA, (STATEP), INDICES
+       vsuxei8.v       HGDC, (STATEP_C), INDICES
+       ret
+SYM_FUNC_END(sha256_transform_zvknha_or_zvknhb_zvkb)
+
+.section ".rodata"
+.p2align 2
+.type K256, @object
+K256:
+       .word           0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+       .word           0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+       .word           0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+       .word           0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+       .word           0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+       .word           0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+       .word           0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+       .word           0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+       .word           0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+       .word           0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+       .word           0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+       .word           0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+       .word           0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+       .word           0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+       .word           0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+       .word           0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+.size K256, . - K256
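
The indexed loads at the top of the function permute the linear state {a,b,c,d,e,f,g,h} into the {f,e,b,a}/{h,g,d,c} groups that the vsha2* instructions expect, using the byte offsets {20,16,4,0} packed into 0x00041014. A standalone C demonstration of that permutation (not part of the patch; assumes a little-endian host, as on RISC-V):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        /* state[] = {a,b,c,d,e,f,g,h}, with recognizable values. */
        uint32_t state[8] = { 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11 };
        uint32_t feba[4], hgdc[4];
        uint32_t packed = 0x00041014;
        uint8_t idx[4];
        int i;

        memcpy(idx, &packed, 4);        /* little-endian: idx = {20, 16, 4, 0} */

        for (i = 0; i < 4; i++) {
                /* vluxei8.v FEBA, (STATEP), INDICES */
                memcpy(&feba[i], (uint8_t *)state + idx[i], 4);
                /* vluxei8.v HGDC, (STATEP_C), INDICES, STATEP_C = STATEP + 8 */
                memcpy(&hgdc[i], (uint8_t *)(state + 2) + idx[i], 4);
        }

        printf("FEBA = {%x, %x, %x, %x}\n", (unsigned)feba[0], (unsigned)feba[1],
               (unsigned)feba[2], (unsigned)feba[3]);   /* -> 0xf, 0xe, 0xb, 0xa */
        printf("HGDC = {%x, %x, %x, %x}\n", (unsigned)hgdc[0], (unsigned)hgdc[1],
               (unsigned)hgdc[2], (unsigned)hgdc[3]);   /* -> 0x11, 0x10, 0xd, 0xc */
        return 0;
}
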
diff --git a/arch/riscv/crypto/sha512-riscv64-glue.c b/arch/riscv/crypto/sha512-riscv64-glue.c
new file mode 100644 (file)
index 0000000..43b56a0
--- /dev/null
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SHA-512 and SHA-384 using the RISC-V vector crypto extensions
+ *
+ * Copyright (C) 2023 VRULL GmbH
+ * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
+ *
+ * Copyright (C) 2023 SiFive, Inc.
+ * Author: Jerry Shih <jerry.shih@sifive.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/sha512_base.h>
+#include <linux/linkage.h>
+#include <linux/module.h>
+
+/*
+ * Note: the asm function only uses the 'state' field of struct sha512_state.
+ * It is assumed to be the first field.
+ */
+asmlinkage void sha512_transform_zvknhb_zvkb(
+       struct sha512_state *state, const u8 *data, int num_blocks);
+
+static int riscv64_sha512_update(struct shash_desc *desc, const u8 *data,
+                                unsigned int len)
+{
+       /*
+        * Ensure struct sha512_state begins directly with the SHA-512
+        * 512-bit internal state, as this is what the asm function expects.
+        */
+       BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0);
+
+       if (crypto_simd_usable()) {
+               kernel_vector_begin();
+               sha512_base_do_update(desc, data, len,
+                                     sha512_transform_zvknhb_zvkb);
+               kernel_vector_end();
+       } else {
+               crypto_sha512_update(desc, data, len);
+       }
+       return 0;
+}
+
+static int riscv64_sha512_finup(struct shash_desc *desc, const u8 *data,
+                               unsigned int len, u8 *out)
+{
+       if (crypto_simd_usable()) {
+               kernel_vector_begin();
+               if (len)
+                       sha512_base_do_update(desc, data, len,
+                                             sha512_transform_zvknhb_zvkb);
+               sha512_base_do_finalize(desc, sha512_transform_zvknhb_zvkb);
+               kernel_vector_end();
+
+               return sha512_base_finish(desc, out);
+       }
+
+       return crypto_sha512_finup(desc, data, len, out);
+}
+
+static int riscv64_sha512_final(struct shash_desc *desc, u8 *out)
+{
+       return riscv64_sha512_finup(desc, NULL, 0, out);
+}
+
+static int riscv64_sha512_digest(struct shash_desc *desc, const u8 *data,
+                                unsigned int len, u8 *out)
+{
+       return sha512_base_init(desc) ?:
+              riscv64_sha512_finup(desc, data, len, out);
+}
+
+static struct shash_alg riscv64_sha512_algs[] = {
+       {
+               .init = sha512_base_init,
+               .update = riscv64_sha512_update,
+               .final = riscv64_sha512_final,
+               .finup = riscv64_sha512_finup,
+               .digest = riscv64_sha512_digest,
+               .descsize = sizeof(struct sha512_state),
+               .digestsize = SHA512_DIGEST_SIZE,
+               .base = {
+                       .cra_blocksize = SHA512_BLOCK_SIZE,
+                       .cra_priority = 300,
+                       .cra_name = "sha512",
+                       .cra_driver_name = "sha512-riscv64-zvknhb-zvkb",
+                       .cra_module = THIS_MODULE,
+               },
+       }, {
+               .init = sha384_base_init,
+               .update = riscv64_sha512_update,
+               .final = riscv64_sha512_final,
+               .finup = riscv64_sha512_finup,
+               .descsize = sizeof(struct sha512_state),
+               .digestsize = SHA384_DIGEST_SIZE,
+               .base = {
+                       .cra_blocksize = SHA384_BLOCK_SIZE,
+                       .cra_priority = 300,
+                       .cra_name = "sha384",
+                       .cra_driver_name = "sha384-riscv64-zvknhb-zvkb",
+                       .cra_module = THIS_MODULE,
+               },
+       },
+};
+
+static int __init riscv64_sha512_mod_init(void)
+{
+       if (riscv_isa_extension_available(NULL, ZVKNHB) &&
+           riscv_isa_extension_available(NULL, ZVKB) &&
+           riscv_vector_vlen() >= 128)
+               return crypto_register_shashes(riscv64_sha512_algs,
+                                              ARRAY_SIZE(riscv64_sha512_algs));
+
+       return -ENODEV;
+}
+
+static void __exit riscv64_sha512_mod_exit(void)
+{
+       crypto_unregister_shashes(riscv64_sha512_algs,
+                                 ARRAY_SIZE(riscv64_sha512_algs));
+}
+
+module_init(riscv64_sha512_mod_init);
+module_exit(riscv64_sha512_mod_exit);
+
+MODULE_DESCRIPTION("SHA-512 (RISC-V accelerated)");
+MODULE_AUTHOR("Heiko Stuebner <heiko.stuebner@vrull.eu>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("sha512");
+MODULE_ALIAS_CRYPTO("sha384");
diff --git a/arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S b/arch/riscv/crypto/sha512-riscv64-zvknhb-zvkb.S
new file mode 100644 (file)
index 0000000..3a9ae21
--- /dev/null
@@ -0,0 +1,203 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// This file is dual-licensed, meaning that you can use it under your
+// choice of either of the following two licenses:
+//
+// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You can obtain
+// a copy in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// or
+//
+// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
+// Copyright (c) 2023, Phoebe Chen <phoebe.chen@sifive.com>
+// Copyright 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The generated code of this file depends on the following RISC-V extensions:
+// - RV64I
+// - RISC-V Vector ('V') with VLEN >= 128
+// - RISC-V Vector SHA-2 Secure Hash extension ('Zvknhb')
+// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
+
+#include <linux/cfi_types.h>
+
+.text
+.option arch, +zvknhb, +zvkb
+
+#define STATEP         a0
+#define DATA           a1
+#define NUM_BLOCKS     a2
+
+#define STATEP_C       a3
+#define K              a4
+
+#define MASK           v0
+#define INDICES                v1
+#define W0             v10     // LMUL=2
+#define W1             v12     // LMUL=2
+#define W2             v14     // LMUL=2
+#define W3             v16     // LMUL=2
+#define VTMP           v20     // LMUL=2
+#define FEBA           v22     // LMUL=2
+#define HGDC           v24     // LMUL=2
+#define PREV_FEBA      v26     // LMUL=2
+#define PREV_HGDC      v28     // LMUL=2
+
+// Do 4 rounds of SHA-512.  w0 contains the current 4 message schedule words.
+//
+// If not all the message schedule words have been computed yet, then this also
+// computes 4 more message schedule words.  w1-w3 contain the next 3 groups of 4
+// message schedule words; this macro computes the group after w3 and writes it
+// to w0.  This means that the next (w0, w1, w2, w3) is the current (w1, w2, w3,
+// w0), so the caller must cycle through the registers accordingly.
+.macro sha512_4rounds  last, w0, w1, w2, w3
+       vle64.v         VTMP, (K)
+       addi            K, K, 32
+       vadd.vv         VTMP, VTMP, \w0
+       vsha2cl.vv      HGDC, FEBA, VTMP
+       vsha2ch.vv      FEBA, HGDC, VTMP
+.if !\last
+       vmerge.vvm      VTMP, \w2, \w1, MASK
+       vsha2ms.vv      \w0, VTMP, \w3
+.endif
+.endm
+
+.macro sha512_16rounds last
+       sha512_4rounds  \last, W0, W1, W2, W3
+       sha512_4rounds  \last, W1, W2, W3, W0
+       sha512_4rounds  \last, W2, W3, W0, W1
+       sha512_4rounds  \last, W3, W0, W1, W2
+.endm
+
+// void sha512_transform_zvknhb_zvkb(u64 state[8], const u8 *data,
+//                                  int num_blocks);
+SYM_TYPED_FUNC_START(sha512_transform_zvknhb_zvkb)
+
+       // Set up the mask for the vmerge to replace the first word (idx==0) in
+       // message scheduling.  There are 4 words, so an 8-bit mask suffices.
+       vsetivli        zero, 1, e8, m1, ta, ma
+       vmv.v.i         MASK, 0x01
+
+       // Load the state.  The state is stored as {a,b,c,d,e,f,g,h}, but we
+       // need {f,e,b,a},{h,g,d,c}.  The dst vtype is e64m2 and the index vtype
+       // is e8mf4.  We use index-load with the i8 indices {40, 32, 8, 0},
+       // loaded using the 32-bit little endian value 0x00082028.
+       li              t0, 0x00082028
+       vsetivli        zero, 1, e32, m1, ta, ma
+       vmv.v.x         INDICES, t0
+       addi            STATEP_C, STATEP, 16
+       vsetivli        zero, 4, e64, m2, ta, ma
+       vluxei8.v       FEBA, (STATEP), INDICES
+       vluxei8.v       HGDC, (STATEP_C), INDICES
+
+.Lnext_block:
+       la              K, K512
+       addi            NUM_BLOCKS, NUM_BLOCKS, -1
+
+       // Save the previous state, as it's needed later.
+       vmv.v.v         PREV_FEBA, FEBA
+       vmv.v.v         PREV_HGDC, HGDC
+
+       // Load the next 1024-bit message block and endian-swap each 64-bit word
+       vle64.v         W0, (DATA)
+       vrev8.v         W0, W0
+       addi            DATA, DATA, 32
+       vle64.v         W1, (DATA)
+       vrev8.v         W1, W1
+       addi            DATA, DATA, 32
+       vle64.v         W2, (DATA)
+       vrev8.v         W2, W2
+       addi            DATA, DATA, 32
+       vle64.v         W3, (DATA)
+       vrev8.v         W3, W3
+       addi            DATA, DATA, 32
+
+       // Do the 80 rounds of SHA-512.
+       sha512_16rounds 0
+       sha512_16rounds 0
+       sha512_16rounds 0
+       sha512_16rounds 0
+       sha512_16rounds 1
+
+       // Add the previous state.
+       vadd.vv         FEBA, FEBA, PREV_FEBA
+       vadd.vv         HGDC, HGDC, PREV_HGDC
+
+       // Repeat if more blocks remain.
+       bnez            NUM_BLOCKS, .Lnext_block
+
+       // Store the new state and return.
+       vsuxei8.v       FEBA, (STATEP), INDICES
+       vsuxei8.v       HGDC, (STATEP_C), INDICES
+       ret
+SYM_FUNC_END(sha512_transform_zvknhb_zvkb)
+
+.section ".rodata"
+.p2align 3
+.type K512, @object
+K512:
+       .dword          0x428a2f98d728ae22, 0x7137449123ef65cd
+       .dword          0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
+       .dword          0x3956c25bf348b538, 0x59f111f1b605d019
+       .dword          0x923f82a4af194f9b, 0xab1c5ed5da6d8118
+       .dword          0xd807aa98a3030242, 0x12835b0145706fbe
+       .dword          0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
+       .dword          0x72be5d74f27b896f, 0x80deb1fe3b1696b1
+       .dword          0x9bdc06a725c71235, 0xc19bf174cf692694
+       .dword          0xe49b69c19ef14ad2, 0xefbe4786384f25e3
+       .dword          0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
+       .dword          0x2de92c6f592b0275, 0x4a7484aa6ea6e483
+       .dword          0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
+       .dword          0x983e5152ee66dfab, 0xa831c66d2db43210
+       .dword          0xb00327c898fb213f, 0xbf597fc7beef0ee4
+       .dword          0xc6e00bf33da88fc2, 0xd5a79147930aa725
+       .dword          0x06ca6351e003826f, 0x142929670a0e6e70
+       .dword          0x27b70a8546d22ffc, 0x2e1b21385c26c926
+       .dword          0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
+       .dword          0x650a73548baf63de, 0x766a0abb3c77b2a8
+       .dword          0x81c2c92e47edaee6, 0x92722c851482353b
+       .dword          0xa2bfe8a14cf10364, 0xa81a664bbc423001
+       .dword          0xc24b8b70d0f89791, 0xc76c51a30654be30
+       .dword          0xd192e819d6ef5218, 0xd69906245565a910
+       .dword          0xf40e35855771202a, 0x106aa07032bbd1b8
+       .dword          0x19a4c116b8d2d0c8, 0x1e376c085141ab53
+       .dword          0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
+       .dword          0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb
+       .dword          0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
+       .dword          0x748f82ee5defb2fc, 0x78a5636f43172f60
+       .dword          0x84c87814a1f0ab72, 0x8cc702081a6439ec
+       .dword          0x90befffa23631e28, 0xa4506cebde82bde9
+       .dword          0xbef9a3f7b2c67915, 0xc67178f2e372532b
+       .dword          0xca273eceea26619c, 0xd186b8c721c0c207
+       .dword          0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
+       .dword          0x06f067aa72176fba, 0x0a637dc5a2c898a6
+       .dword          0x113f9804bef90dae, 0x1b710b35131c471b
+       .dword          0x28db77f523047d84, 0x32caab7b40c72493
+       .dword          0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
+       .dword          0x4cc5d4becb3e42b6, 0x597f299cfc657e2a
+       .dword          0x5fcb6fab3ad6faec, 0x6c44198c4a475817
+.size K512, . - K512
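
The sha512_4rounds/sha512_16rounds macros rely on the register rotation described in the macro comment: after four rounds consume W0, the next four schedule words are written back into W0 and the roles of W0-W3 rotate. A rough standalone C model of that schedule (not part of the patch; do_4_rounds() and next_4_words() are empty stand-ins for vsha2cl/vsha2ch and vsha2ms):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for "do 4 rounds with this group" (vsha2cl/vsha2ch). */
static void do_4_rounds(const uint64_t w[4])
{
        (void)w;
}

/* Stand-in for the vsha2ms step: derive the next 4 schedule words
 * from the previous 16. */
static void next_4_words(uint64_t w0[4], const uint64_t w1[4],
                         const uint64_t w2[4], const uint64_t w3[4])
{
        (void)w0; (void)w1; (void)w2; (void)w3;
}

int main(void)
{
        uint64_t w[4][4] = { { 0 } };   /* w[0..3] hold message words 0..15 */
        int round4;

        for (round4 = 0; round4 < 20; round4++) {       /* 20 * 4 = 80 rounds */
                int cur = round4 % 4;   /* which group plays the role of "w0" */

                do_4_rounds(w[cur]);
                if (round4 < 16)        /* the last 16 rounds need no new words */
                        next_4_words(w[cur], w[(cur + 1) % 4],
                                     w[(cur + 2) % 4], w[(cur + 3) % 4]);
        }
        printf("80 rounds processed as 20 groups of 4\n");
        return 0;
}
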
diff --git a/arch/riscv/crypto/sm3-riscv64-glue.c b/arch/riscv/crypto/sm3-riscv64-glue.c
new file mode 100644 (file)
index 0000000..e1737a9
--- /dev/null
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * SM3 using the RISC-V vector crypto extensions
+ *
+ * Copyright (C) 2023 VRULL GmbH
+ * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
+ *
+ * Copyright (C) 2023 SiFive, Inc.
+ * Author: Jerry Shih <jerry.shih@sifive.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <crypto/sm3_base.h>
+#include <linux/linkage.h>
+#include <linux/module.h>
+
+/*
+ * Note: the asm function only uses the 'state' field of struct sm3_state.
+ * It is assumed to be the first field.
+ */
+asmlinkage void sm3_transform_zvksh_zvkb(
+       struct sm3_state *state, const u8 *data, int num_blocks);
+
+static int riscv64_sm3_update(struct shash_desc *desc, const u8 *data,
+                             unsigned int len)
+{
+       /*
+        * Ensure struct sm3_state begins directly with the SM3
+        * 256-bit internal state, as this is what the asm function expects.
+        */
+       BUILD_BUG_ON(offsetof(struct sm3_state, state) != 0);
+
+       if (crypto_simd_usable()) {
+               kernel_vector_begin();
+               sm3_base_do_update(desc, data, len, sm3_transform_zvksh_zvkb);
+               kernel_vector_end();
+       } else {
+               sm3_update(shash_desc_ctx(desc), data, len);
+       }
+       return 0;
+}
+
+static int riscv64_sm3_finup(struct shash_desc *desc, const u8 *data,
+                            unsigned int len, u8 *out)
+{
+       struct sm3_state *ctx;
+
+       if (crypto_simd_usable()) {
+               kernel_vector_begin();
+               if (len)
+                       sm3_base_do_update(desc, data, len,
+                                          sm3_transform_zvksh_zvkb);
+               sm3_base_do_finalize(desc, sm3_transform_zvksh_zvkb);
+               kernel_vector_end();
+
+               return sm3_base_finish(desc, out);
+       }
+
+       ctx = shash_desc_ctx(desc);
+       if (len)
+               sm3_update(ctx, data, len);
+       sm3_final(ctx, out);
+
+       return 0;
+}
+
+static int riscv64_sm3_final(struct shash_desc *desc, u8 *out)
+{
+       return riscv64_sm3_finup(desc, NULL, 0, out);
+}
+
+static struct shash_alg riscv64_sm3_alg = {
+       .init = sm3_base_init,
+       .update = riscv64_sm3_update,
+       .final = riscv64_sm3_final,
+       .finup = riscv64_sm3_finup,
+       .descsize = sizeof(struct sm3_state),
+       .digestsize = SM3_DIGEST_SIZE,
+       .base = {
+               .cra_blocksize = SM3_BLOCK_SIZE,
+               .cra_priority = 300,
+               .cra_name = "sm3",
+               .cra_driver_name = "sm3-riscv64-zvksh-zvkb",
+               .cra_module = THIS_MODULE,
+       },
+};
+
+static int __init riscv64_sm3_mod_init(void)
+{
+       if (riscv_isa_extension_available(NULL, ZVKSH) &&
+           riscv_isa_extension_available(NULL, ZVKB) &&
+           riscv_vector_vlen() >= 128)
+               return crypto_register_shash(&riscv64_sm3_alg);
+
+       return -ENODEV;
+}
+
+static void __exit riscv64_sm3_mod_exit(void)
+{
+       crypto_unregister_shash(&riscv64_sm3_alg);
+}
+
+module_init(riscv64_sm3_mod_init);
+module_exit(riscv64_sm3_mod_exit);
+
+MODULE_DESCRIPTION("SM3 (RISC-V accelerated)");
+MODULE_AUTHOR("Heiko Stuebner <heiko.stuebner@vrull.eu>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("sm3");
diff --git a/arch/riscv/crypto/sm3-riscv64-zvksh-zvkb.S b/arch/riscv/crypto/sm3-riscv64-zvksh-zvkb.S
new file mode 100644 (file)
index 0000000..a2b65d9
--- /dev/null
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// This file is dual-licensed, meaning that you can use it under your
+// choice of either of the following two licenses:
+//
+// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You can obtain
+// a copy in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// or
+//
+// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
+// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
+// Copyright 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The generated code of this file depends on the following RISC-V extensions:
+// - RV64I
+// - RISC-V Vector ('V') with VLEN >= 128
+// - RISC-V Vector SM3 Secure Hash extension ('Zvksh')
+// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
+
+#include <linux/cfi_types.h>
+
+.text
+.option arch, +zvksh, +zvkb
+
+#define STATEP         a0
+#define DATA           a1
+#define NUM_BLOCKS     a2
+
+#define STATE          v0      // LMUL=2
+#define PREV_STATE     v2      // LMUL=2
+#define W0             v4      // LMUL=2
+#define W1             v6      // LMUL=2
+#define VTMP           v8      // LMUL=2
+
+.macro sm3_8rounds     i, w0, w1
+       // Do 4 rounds using W_{0+i}..W_{7+i}.
+       vsm3c.vi        STATE, \w0, \i + 0
+       vslidedown.vi   VTMP, \w0, 2
+       vsm3c.vi        STATE, VTMP, \i + 1
+
+       // Compute W_{4+i}..W_{11+i}.
+       vslidedown.vi   VTMP, \w0, 4
+       vslideup.vi     VTMP, \w1, 4
+
+       // Do 4 rounds using W_{4+i}..W_{11+i}.
+       vsm3c.vi        STATE, VTMP, \i + 2
+       vslidedown.vi   VTMP, VTMP, 2
+       vsm3c.vi        STATE, VTMP, \i + 3
+
+.if \i < 28
+       // Compute W_{16+i}..W_{23+i}.
+       vsm3me.vv       \w0, \w1, \w0
+.endif
+       // For the next 8 rounds, w0 and w1 are swapped.
+.endm
+
+// void sm3_transform_zvksh_zvkb(u32 state[8], const u8 *data, int num_blocks);
+SYM_TYPED_FUNC_START(sm3_transform_zvksh_zvkb)
+
+       // Load the state and endian-swap each 32-bit word.
+       vsetivli        zero, 8, e32, m2, ta, ma
+       vle32.v         STATE, (STATEP)
+       vrev8.v         STATE, STATE
+
+.Lnext_block:
+       addi            NUM_BLOCKS, NUM_BLOCKS, -1
+
+       // Save the previous state, as it's needed later.
+       vmv.v.v         PREV_STATE, STATE
+
+       // Load the next 512-bit message block into W0-W1.
+       vle32.v         W0, (DATA)
+       addi            DATA, DATA, 32
+       vle32.v         W1, (DATA)
+       addi            DATA, DATA, 32
+
+       // Do the 64 rounds of SM3.
+       sm3_8rounds     0, W0, W1
+       sm3_8rounds     4, W1, W0
+       sm3_8rounds     8, W0, W1
+       sm3_8rounds     12, W1, W0
+       sm3_8rounds     16, W0, W1
+       sm3_8rounds     20, W1, W0
+       sm3_8rounds     24, W0, W1
+       sm3_8rounds     28, W1, W0
+
+       // XOR in the previous state.
+       vxor.vv         STATE, STATE, PREV_STATE
+
+       // Repeat if more blocks remain.
+       bnez            NUM_BLOCKS, .Lnext_block
+
+       // Store the new state and return.
+       vrev8.v         STATE, STATE
+       vse32.v         STATE, (STATEP)
+       ret
+SYM_FUNC_END(sm3_transform_zvksh_zvkb)
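
The sm3_8rounds macro ping-pongs between W0 and W1: each call consumes eight rounds' worth of message words from the current group (plus the first half of the other group, via the slide instructions), expands the next eight words into it unless it is the final call, and then the two groups swap roles. A rough standalone C model (not part of the patch; do_8_rounds() and expand_8_words() are empty stand-ins for the vsm3c and vsm3me steps):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for 8 rounds of vsm3c.vi: uses all of cur and, via the slide
 * instructions in the asm, the first half of next. */
static void do_8_rounds(const uint32_t cur[8], const uint32_t next[8])
{
        (void)cur; (void)next;
}

/* Stand-in for vsm3me.vv: derive message words 16+i..23+i from the
 * previous sixteen. */
static void expand_8_words(uint32_t dst[8], const uint32_t other[8])
{
        (void)dst; (void)other;
}

int main(void)
{
        uint32_t w[2][8] = { { 0 } };   /* w[0] = words 0..7, w[1] = words 8..15 */
        int call;

        for (call = 0; call < 8; call++) {      /* 8 calls * 8 rounds = 64 rounds */
                int cur = call % 2;             /* W0 and W1 alternate */

                do_8_rounds(w[cur], w[!cur]);
                if (call < 7)                   /* the final call needs no new words */
                        expand_8_words(w[cur], w[!cur]);
        }
        printf("64 rounds processed as 8 calls, swapping W0/W1 each time\n");
        return 0;
}
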
diff --git a/arch/riscv/crypto/sm4-riscv64-glue.c b/arch/riscv/crypto/sm4-riscv64-glue.c
new file mode 100644 (file)
index 0000000..47fb84e
--- /dev/null
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * SM4 using the RISC-V vector crypto extensions
+ *
+ * Copyright (C) 2023 VRULL GmbH
+ * Author: Heiko Stuebner <heiko.stuebner@vrull.eu>
+ *
+ * Copyright (C) 2023 SiFive, Inc.
+ * Author: Jerry Shih <jerry.shih@sifive.com>
+ */
+
+#include <asm/simd.h>
+#include <asm/vector.h>
+#include <crypto/internal/cipher.h>
+#include <crypto/internal/simd.h>
+#include <crypto/sm4.h>
+#include <linux/linkage.h>
+#include <linux/module.h>
+
+asmlinkage void sm4_expandkey_zvksed_zvkb(const u8 user_key[SM4_KEY_SIZE],
+                                         u32 rkey_enc[SM4_RKEY_WORDS],
+                                         u32 rkey_dec[SM4_RKEY_WORDS]);
+asmlinkage void sm4_crypt_zvksed_zvkb(const u32 rkey[SM4_RKEY_WORDS],
+                                     const u8 in[SM4_BLOCK_SIZE],
+                                     u8 out[SM4_BLOCK_SIZE]);
+
+static int riscv64_sm4_setkey(struct crypto_tfm *tfm, const u8 *key,
+                             unsigned int keylen)
+{
+       struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+       if (crypto_simd_usable()) {
+               if (keylen != SM4_KEY_SIZE)
+                       return -EINVAL;
+               kernel_vector_begin();
+               sm4_expandkey_zvksed_zvkb(key, ctx->rkey_enc, ctx->rkey_dec);
+               kernel_vector_end();
+               return 0;
+       }
+       return sm4_expandkey(ctx, key, keylen);
+}
+
+static void riscv64_sm4_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+       const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+       if (crypto_simd_usable()) {
+               kernel_vector_begin();
+               sm4_crypt_zvksed_zvkb(ctx->rkey_enc, src, dst);
+               kernel_vector_end();
+       } else {
+               sm4_crypt_block(ctx->rkey_enc, dst, src);
+       }
+}
+
+static void riscv64_sm4_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src)
+{
+       const struct sm4_ctx *ctx = crypto_tfm_ctx(tfm);
+
+       if (crypto_simd_usable()) {
+               kernel_vector_begin();
+               sm4_crypt_zvksed_zvkb(ctx->rkey_dec, src, dst);
+               kernel_vector_end();
+       } else {
+               sm4_crypt_block(ctx->rkey_dec, dst, src);
+       }
+}
+
+static struct crypto_alg riscv64_sm4_alg = {
+       .cra_flags = CRYPTO_ALG_TYPE_CIPHER,
+       .cra_blocksize = SM4_BLOCK_SIZE,
+       .cra_ctxsize = sizeof(struct sm4_ctx),
+       .cra_priority = 300,
+       .cra_name = "sm4",
+       .cra_driver_name = "sm4-riscv64-zvksed-zvkb",
+       .cra_cipher = {
+               .cia_min_keysize = SM4_KEY_SIZE,
+               .cia_max_keysize = SM4_KEY_SIZE,
+               .cia_setkey = riscv64_sm4_setkey,
+               .cia_encrypt = riscv64_sm4_encrypt,
+               .cia_decrypt = riscv64_sm4_decrypt,
+       },
+       .cra_module = THIS_MODULE,
+};
+
+static int __init riscv64_sm4_mod_init(void)
+{
+       if (riscv_isa_extension_available(NULL, ZVKSED) &&
+           riscv_isa_extension_available(NULL, ZVKB) &&
+           riscv_vector_vlen() >= 128)
+               return crypto_register_alg(&riscv64_sm4_alg);
+
+       return -ENODEV;
+}
+
+static void __exit riscv64_sm4_mod_exit(void)
+{
+       crypto_unregister_alg(&riscv64_sm4_alg);
+}
+
+module_init(riscv64_sm4_mod_init);
+module_exit(riscv64_sm4_mod_exit);
+
+MODULE_DESCRIPTION("SM4 (RISC-V accelerated)");
+MODULE_AUTHOR("Heiko Stuebner <heiko.stuebner@vrull.eu>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("sm4");
diff --git a/arch/riscv/crypto/sm4-riscv64-zvksed-zvkb.S b/arch/riscv/crypto/sm4-riscv64-zvksed-zvkb.S
new file mode 100644 (file)
index 0000000..fae6217
--- /dev/null
@@ -0,0 +1,117 @@
+/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */
+//
+// This file is dual-licensed, meaning that you can use it under your
+// choice of either of the following two licenses:
+//
+// Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
+//
+// Licensed under the Apache License 2.0 (the "License"). You can obtain
+// a copy in the file LICENSE in the source distribution or at
+// https://www.openssl.org/source/license.html
+//
+// or
+//
+// Copyright (c) 2023, Christoph Müllner <christoph.muellner@vrull.eu>
+// Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
+// Copyright 2024 Google LLC
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+// 1. Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions and the following disclaimer in the
+//    documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The generated code of this file depends on the following RISC-V extensions:
+// - RV64I
+// - RISC-V Vector ('V') with VLEN >= 128
+// - RISC-V Vector SM4 Block Cipher extension ('Zvksed')
+// - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
+
+#include <linux/linkage.h>
+
+.text
+.option arch, +zvksed, +zvkb
+
+// void sm4_expandkey_zvksed_zvkb(const u8 user_key[16], u32 rkey_enc[32],
+//                              u32 rkey_dec[32]);
+SYM_FUNC_START(sm4_expandkey_zvksed_zvkb)
+       vsetivli        zero, 4, e32, m1, ta, ma
+
+       // Load the user key.
+       vle32.v         v1, (a0)
+       vrev8.v         v1, v1
+
+       // XOR the user key with the family key.
+       la              t0, FAMILY_KEY
+       vle32.v         v2, (t0)
+       vxor.vv         v1, v1, v2
+
+       // Compute the round keys.  Store them in forwards order in rkey_enc
+       // and in reverse order in rkey_dec.
+       addi            a2, a2, 31*4
+       li              t0, -4
+       .set            i, 0
+.rept 8
+       vsm4k.vi        v1, v1, i
+       vse32.v         v1, (a1)        // Store to rkey_enc.
+       vsse32.v        v1, (a2), t0    // Store to rkey_dec.
+.if i < 7
+       addi            a1, a1, 16
+       addi            a2, a2, -16
+.endif
+       .set            i, i + 1
+.endr
+
+       ret
+SYM_FUNC_END(sm4_expandkey_zvksed_zvkb)
+
+// void sm4_crypt_zvksed_zvkb(const u32 rkey[32], const u8 in[16], u8 out[16]);
+SYM_FUNC_START(sm4_crypt_zvksed_zvkb)
+       vsetivli        zero, 4, e32, m1, ta, ma
+
+       // Load the input data.
+       vle32.v         v1, (a1)
+       vrev8.v         v1, v1
+
+       // Do the 32 rounds of SM4, 4 at a time.
+       .set            i, 0
+.rept 8
+       vle32.v         v2, (a0)
+       vsm4r.vs        v1, v2
+.if i < 7
+       addi            a0, a0, 16
+.endif
+       .set            i, i + 1
+.endr
+
+       // Store the output data (in reverse element order).
+       vrev8.v         v1, v1
+       li              t0, -4
+       addi            a2, a2, 12
+       vsse32.v        v1, (a2), t0
+
+       ret
+SYM_FUNC_END(sm4_crypt_zvksed_zvkb)
+
+.section ".rodata"
+.p2align 2
+.type FAMILY_KEY, @object
+FAMILY_KEY:
+       .word 0xA3B1BAC6, 0x56AA3350, 0x677D9197, 0xB27022DC
+.size FAMILY_KEY, . - FAMILY_KEY
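
The key schedule above writes each group of four round keys forwards into rkey_enc (vse32.v) and, via the negative-stride store (vsse32.v with stride -4) starting from the end of the table, backwards into rkey_dec, so the decryption schedule is simply the encryption schedule reversed. A standalone C check of that layout (not part of the patch; the key values are dummies):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t rkey_enc[32], rkey_dec[32];
        int group, j;

        for (group = 0; group < 8; group++) {
                uint32_t rk[4];

                /* Stand-in for vsm4k.vi: pretend these are the 4 new round keys. */
                for (j = 0; j < 4; j++)
                        rk[j] = (uint32_t)(group * 4 + j);

                for (j = 0; j < 4; j++) {
                        /* vse32.v: append forwards to rkey_enc */
                        rkey_enc[group * 4 + j] = rk[j];
                        /* vsse32.v, stride -4: write backwards into rkey_dec,
                         * starting from the current end of the table */
                        rkey_dec[31 - group * 4 - j] = rk[j];
                }
        }

        for (j = 0; j < 32; j++)
                if (rkey_dec[j] != rkey_enc[31 - j])
                        return 1;
        printf("rkey_dec is rkey_enc in reverse order\n");
        return 0;
}
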
index 17a90486972468fce5e85c9287f091da14a77e66..f2708a9494a10f0012ef7ac6068a8615b3efbced 100644 (file)
@@ -18,9 +18,9 @@
 #include <asm/sbi.h>
 #include <asm/vendorid_list.h>
 
-#define ANDESTECH_AX45MP_MARCHID       0x8000000000008a45UL
-#define ANDESTECH_AX45MP_MIMPID                0x500UL
-#define ANDESTECH_SBI_EXT_ANDES                0x0900031E
+#define ANDES_AX45MP_MARCHID           0x8000000000008a45UL
+#define ANDES_AX45MP_MIMPID            0x500UL
+#define ANDES_SBI_EXT_ANDES            0x0900031E
 
 #define ANDES_SBI_EXT_IOCP_SW_WORKAROUND       1
 
@@ -32,7 +32,7 @@ static long ax45mp_iocp_sw_workaround(void)
         * The ANDES_SBI_EXT_IOCP_SW_WORKAROUND SBI extension checks whether the IOCP
         * is missing and the cache is controllable; only then is CMO applied.
         */
-       ret = sbi_ecall(ANDESTECH_SBI_EXT_ANDES, ANDES_SBI_EXT_IOCP_SW_WORKAROUND,
+       ret = sbi_ecall(ANDES_SBI_EXT_ANDES, ANDES_SBI_EXT_IOCP_SW_WORKAROUND,
                        0, 0, 0, 0, 0, 0);
 
        return ret.error ? 0 : ret.value;
@@ -50,7 +50,7 @@ static void errata_probe_iocp(unsigned int stage, unsigned long arch_id, unsigne
 
        done = true;
 
-       if (arch_id != ANDESTECH_AX45MP_MARCHID || impid != ANDESTECH_AX45MP_MIMPID)
+       if (arch_id != ANDES_AX45MP_MARCHID || impid != ANDES_AX45MP_MIMPID)
                return;
 
        if (!ax45mp_iocp_sw_workaround())
index b0487b39e6747ae384fb51f310bf784759825240..776354895b81e7dc332e58265548aaf7365a6037 100644 (file)
        REG_L x31, PT_T6(sp)
        .endm
 
+/* Annotate a function as being unsuitable for kprobes. */
+#ifdef CONFIG_KPROBES
+#define ASM_NOKPROBE(name)                             \
+       .pushsection "_kprobe_blacklist", "aw";         \
+       RISCV_PTR name;                                 \
+       .popsection
+#else
+#define ASM_NOKPROBE(name)
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_RISCV_ASM_H */
index 9ffc355370248aed22dbe690ba1cde8e682a3588..c4c2173dfe996aeabba1375c8c0f8bec069ec537 100644 (file)
 #include <asm-generic/bitops/fls.h>
 
 #else
+#define __HAVE_ARCH___FFS
+#define __HAVE_ARCH___FLS
+#define __HAVE_ARCH_FFS
+#define __HAVE_ARCH_FLS
+
+#include <asm-generic/bitops/__ffs.h>
+#include <asm-generic/bitops/__fls.h>
+#include <asm-generic/bitops/ffs.h>
+#include <asm-generic/bitops/fls.h>
+
 #include <asm/alternative-macros.h>
 #include <asm/hwcap.h>
 
@@ -37,8 +47,6 @@
 
 static __always_inline unsigned long variable__ffs(unsigned long word)
 {
-       int num;
-
        asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
                                      RISCV_ISA_EXT_ZBB, 1)
                          : : : : legacy);
@@ -52,32 +60,7 @@ static __always_inline unsigned long variable__ffs(unsigned long word)
        return word;
 
 legacy:
-       num = 0;
-#if BITS_PER_LONG == 64
-       if ((word & 0xffffffff) == 0) {
-               num += 32;
-               word >>= 32;
-       }
-#endif
-       if ((word & 0xffff) == 0) {
-               num += 16;
-               word >>= 16;
-       }
-       if ((word & 0xff) == 0) {
-               num += 8;
-               word >>= 8;
-       }
-       if ((word & 0xf) == 0) {
-               num += 4;
-               word >>= 4;
-       }
-       if ((word & 0x3) == 0) {
-               num += 2;
-               word >>= 2;
-       }
-       if ((word & 0x1) == 0)
-               num += 1;
-       return num;
+       return generic___ffs(word);
 }
 
 /**
@@ -93,8 +76,6 @@ legacy:
 
 static __always_inline unsigned long variable__fls(unsigned long word)
 {
-       int num;
-
        asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
                                      RISCV_ISA_EXT_ZBB, 1)
                          : : : : legacy);
@@ -108,32 +89,7 @@ static __always_inline unsigned long variable__fls(unsigned long word)
        return BITS_PER_LONG - 1 - word;
 
 legacy:
-       num = BITS_PER_LONG - 1;
-#if BITS_PER_LONG == 64
-       if (!(word & (~0ul << 32))) {
-               num -= 32;
-               word <<= 32;
-       }
-#endif
-       if (!(word & (~0ul << (BITS_PER_LONG - 16)))) {
-               num -= 16;
-               word <<= 16;
-       }
-       if (!(word & (~0ul << (BITS_PER_LONG - 8)))) {
-               num -= 8;
-               word <<= 8;
-       }
-       if (!(word & (~0ul << (BITS_PER_LONG - 4)))) {
-               num -= 4;
-               word <<= 4;
-       }
-       if (!(word & (~0ul << (BITS_PER_LONG - 2)))) {
-               num -= 2;
-               word <<= 2;
-       }
-       if (!(word & (~0ul << (BITS_PER_LONG - 1))))
-               num -= 1;
-       return num;
+       return generic___fls(word);
 }
 
 /**
@@ -149,46 +105,23 @@ legacy:
 
 static __always_inline int variable_ffs(int x)
 {
-       int r;
-
-       if (!x)
-               return 0;
-
        asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
                                      RISCV_ISA_EXT_ZBB, 1)
                          : : : : legacy);
 
+       if (!x)
+               return 0;
+
        asm volatile (".option push\n"
                      ".option arch,+zbb\n"
                      CTZW "%0, %1\n"
                      ".option pop\n"
-                     : "=r" (r) : "r" (x) :);
+                     : "=r" (x) : "r" (x) :);
 
-       return r + 1;
+       return x + 1;
 
 legacy:
-       r = 1;
-       if (!(x & 0xffff)) {
-               x >>= 16;
-               r += 16;
-       }
-       if (!(x & 0xff)) {
-               x >>= 8;
-               r += 8;
-       }
-       if (!(x & 0xf)) {
-               x >>= 4;
-               r += 4;
-       }
-       if (!(x & 3)) {
-               x >>= 2;
-               r += 2;
-       }
-       if (!(x & 1)) {
-               x >>= 1;
-               r += 1;
-       }
-       return r;
+       return generic_ffs(x);
 }
 
 /**
@@ -204,46 +137,23 @@ legacy:
 
 static __always_inline int variable_fls(unsigned int x)
 {
-       int r;
-
-       if (!x)
-               return 0;
-
        asm_volatile_goto(ALTERNATIVE("j %l[legacy]", "nop", 0,
                                      RISCV_ISA_EXT_ZBB, 1)
                          : : : : legacy);
 
+       if (!x)
+               return 0;
+
        asm volatile (".option push\n"
                      ".option arch,+zbb\n"
                      CLZW "%0, %1\n"
                      ".option pop\n"
-                     : "=r" (r) : "r" (x) :);
+                     : "=r" (x) : "r" (x) :);
 
-       return 32 - r;
+       return 32 - x;
 
 legacy:
-       r = 32;
-       if (!(x & 0xffff0000u)) {
-               x <<= 16;
-               r -= 16;
-       }
-       if (!(x & 0xff000000u)) {
-               x <<= 8;
-               r -= 8;
-       }
-       if (!(x & 0xf0000000u)) {
-               x <<= 4;
-               r -= 4;
-       }
-       if (!(x & 0xc0000000u)) {
-               x <<= 2;
-               r -= 2;
-       }
-       if (!(x & 0x80000000u)) {
-               x <<= 1;
-               r -= 1;
-       }
-       return r;
+       return generic_fls(x);
 }
 
 /**
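
The Zbb fast paths above rest on the identities ffs(x) = ctz(x) + 1 and fls(x) = 32 - clz(x) for nonzero x, with the x == 0 case handled separately; moving the "if (!x)" check after the asm goto lets the legacy branch defer it to generic_ffs()/generic_fls(). A standalone C illustration using compiler builtins (not part of the patch; demo_ffs()/demo_fls() are made-up names):

#include <assert.h>
#include <stdio.h>

static int demo_ffs(unsigned int x)
{
        return x ? __builtin_ctz(x) + 1 : 0;    /* CTZW on Zbb */
}

static int demo_fls(unsigned int x)
{
        return x ? 32 - __builtin_clz(x) : 0;   /* CLZW on Zbb */
}

int main(void)
{
        assert(demo_ffs(0x00000001u) == 1);
        assert(demo_ffs(0x00008000u) == 16);
        assert(demo_fls(0x00000001u) == 1);
        assert(demo_fls(0x80000000u) == 32);
        assert(demo_ffs(0) == 0 && demo_fls(0) == 0);
        printf("ffs/fls identities hold\n");
        return 0;
}
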
index 2ac955b51148f4f12d23c03b42aae76d79b79bed..aa103530a5c83a501ac00c0e00c5e15f8e6a59a9 100644 (file)
 
 static inline int is_compat_task(void)
 {
+       if (!IS_ENABLED(CONFIG_COMPAT))
+               return 0;
+
        return test_thread_flag(TIF_32BIT);
 }
 
+static inline int is_compat_thread(struct thread_info *thread)
+{
+       if (!IS_ENABLED(CONFIG_COMPAT))
+               return 0;
+
+       return test_ti_thread_flag(thread, TIF_32BIT);
+}
+
+static inline void set_compat_task(bool is_compat)
+{
+       if (is_compat)
+               set_thread_flag(TIF_32BIT);
+       else
+               clear_thread_flag(TIF_32BIT);
+}
+
 struct compat_user_regs_struct {
        compat_ulong_t pc;
        compat_ulong_t ra;
index 5a626ed2c47a8915b3848df2e7f4a7ea0601bd71..46061f5e976439d0e9c9c5a223fedde10d5a6958 100644 (file)
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 /*
- * Copyright 2022-2023 Rivos, Inc
+ * Copyright 2022-2024 Rivos, Inc
  */
 
 #ifndef _ASM_CPUFEATURE_H
@@ -28,29 +28,38 @@ struct riscv_isainfo {
 
 DECLARE_PER_CPU(struct riscv_cpuinfo, riscv_cpuinfo);
 
-DECLARE_PER_CPU(long, misaligned_access_speed);
-
 /* Per-cpu ISA extensions. */
 extern struct riscv_isainfo hart_isa[NR_CPUS];
 
 void riscv_user_isa_enable(void);
 
-#ifdef CONFIG_RISCV_MISALIGNED
-bool unaligned_ctl_available(void);
-bool check_unaligned_access_emulated(int cpu);
+#if defined(CONFIG_RISCV_MISALIGNED)
+bool check_unaligned_access_emulated_all_cpus(void);
 void unaligned_emulation_finish(void);
+bool unaligned_ctl_available(void);
+DECLARE_PER_CPU(long, misaligned_access_speed);
 #else
 static inline bool unaligned_ctl_available(void)
 {
        return false;
 }
+#endif
+
+#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
+DECLARE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
 
-static inline bool check_unaligned_access_emulated(int cpu)
+static __always_inline bool has_fast_unaligned_accesses(void)
 {
-       return false;
+       return static_branch_likely(&fast_unaligned_access_speed_key);
+}
+#else
+static __always_inline bool has_fast_unaligned_accesses(void)
+{
+       if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
+               return true;
+       else
+               return false;
 }
-
-static inline void unaligned_emulation_finish(void) {}
 #endif
 
 unsigned long riscv_get_elf_hwcap(void);
@@ -135,6 +144,4 @@ static __always_inline bool riscv_cpu_has_extension_unlikely(int cpu, const unsi
        return __riscv_isa_extension_available(hart_isa[cpu].isa, ext);
 }
 
-DECLARE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);
-
 #endif
index 06c236bfab53b323491ce6ae3bbdbbbcd6206318..c7aea7886d22aaede04dba348c38779f139c31ee 100644 (file)
@@ -53,13 +53,9 @@ extern bool compat_elf_check_arch(Elf32_Ehdr *hdr);
 #define ELF_ET_DYN_BASE                ((DEFAULT_MAP_WINDOW / 3) * 2)
 
 #ifdef CONFIG_64BIT
-#ifdef CONFIG_COMPAT
-#define STACK_RND_MASK         (test_thread_flag(TIF_32BIT) ? \
+#define STACK_RND_MASK         (is_compat_task() ? \
                                 0x7ff >> (PAGE_SHIFT - 12) : \
                                 0x3ffff >> (PAGE_SHIFT - 12))
-#else
-#define STACK_RND_MASK         (0x3ffff >> (PAGE_SHIFT - 12))
-#endif
 #endif
 
 /*
@@ -139,10 +135,7 @@ do {                                                       \
 #ifdef CONFIG_COMPAT
 
 #define SET_PERSONALITY(ex)                                    \
-do {    if ((ex).e_ident[EI_CLASS] == ELFCLASS32)              \
-               set_thread_flag(TIF_32BIT);                     \
-       else                                                    \
-               clear_thread_flag(TIF_32BIT);                   \
+do {   set_compat_task((ex).e_ident[EI_CLASS] == ELFCLASS32);  \
        if (personality(current->personality) != PER_LINUX32)   \
                set_personality(PER_LINUX |                     \
                        (current->personality & (~PER_MASK)));  \
index ea33288f8a25b4f76e59bd65e8f869ee842c6e14..1f2dbfb8a8bfc8c9f5d46b9129c26756941c4918 100644 (file)
@@ -12,8 +12,8 @@
 #include <asm/vendorid_list.h>
 
 #ifdef CONFIG_ERRATA_ANDES
-#define ERRATA_ANDESTECH_NO_IOCP       0
-#define ERRATA_ANDESTECH_NUMBER                1
+#define ERRATA_ANDES_NO_IOCP 0
+#define ERRATA_ANDES_NUMBER 1
 #endif
 
 #ifdef CONFIG_ERRATA_SIFIVE
@@ -112,15 +112,6 @@ asm volatile(ALTERNATIVE(                                          \
 #define THEAD_C9XX_RV_IRQ_PMU                  17
 #define THEAD_C9XX_CSR_SCOUNTEROF              0x5c5
 
-#define ALT_SBI_PMU_OVERFLOW(__ovl)                                    \
-asm volatile(ALTERNATIVE(                                              \
-       "csrr %0, " __stringify(CSR_SSCOUNTOVF),                        \
-       "csrr %0, " __stringify(THEAD_C9XX_CSR_SCOUNTEROF),             \
-               THEAD_VENDOR_ID, ERRATA_THEAD_PMU,                      \
-               CONFIG_ERRATA_THEAD_PMU)                                \
-       : "=r" (__ovl) :                                                \
-       : "memory")
-
 #endif /* __ASSEMBLY__ */
 
 #endif
index 5340f818746b71a805319eb6f941fa311c9b36a2..bae7eac76c180c3b988dd277319207d11892a4a1 100644 (file)
@@ -80,6 +80,7 @@
 #define RISCV_ISA_EXT_ZFA              71
 #define RISCV_ISA_EXT_ZTSO             72
 #define RISCV_ISA_EXT_ZACAS            73
+#define RISCV_ISA_EXT_XANDESPMU                74
 
 #define RISCV_ISA_EXT_MAX              128
 #define RISCV_ISA_EXT_INVALID          U32_MAX
diff --git a/arch/riscv/include/asm/membarrier.h b/arch/riscv/include/asm/membarrier.h
new file mode 100644 (file)
index 0000000..47b240d
--- /dev/null
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef _ASM_RISCV_MEMBARRIER_H
+#define _ASM_RISCV_MEMBARRIER_H
+
+static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
+                                            struct mm_struct *next,
+                                            struct task_struct *tsk)
+{
+       /*
+        * Only need the full barrier when switching between processes.
+        * Barrier when switching from kernel to userspace is not
+        * required here, given that it is implied by mmdrop(). Barrier
+        * when switching from userspace to kernel is not needed after
+        * store to rq->curr.
+        */
+       if (IS_ENABLED(CONFIG_SMP) &&
+           likely(!(atomic_read(&next->membarrier_state) &
+                    (MEMBARRIER_STATE_PRIVATE_EXPEDITED |
+                     MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev))
+               return;
+
+       /*
+        * The membarrier system call requires a full memory barrier
+        * after storing to rq->curr, before going back to user-space.
+        *
+        * This barrier is also needed for the SYNC_CORE command when
+        * switching between processes; in particular, on a transition
+        * from a thread belonging to another mm to a thread belonging
+        * to the mm for which a membarrier SYNC_CORE is done on CPU0:
+        *
+        *   - [CPU0] sets all bits in the mm icache_stale_mask (in
+        *     prepare_sync_core_cmd());
+        *
+        *   - [CPU1] stores to rq->curr (by the scheduler);
+        *
+        *   - [CPU0] loads rq->curr within membarrier and observes
+        *     cpu_rq(1)->curr->mm != mm, so the IPI is skipped on
+        *     CPU1; this means membarrier relies on switch_mm() to
+        *     issue the sync-core;
+        *
+        *   - [CPU1] switch_mm() loads icache_stale_mask; if the bit
+        *     is zero, switch_mm() may incorrectly skip the sync-core.
+        *
+        * Matches a full barrier in the proximity of the membarrier
+        * system call entry.
+        */
+       smp_mb();
+}
+
+#endif /* _ASM_RISCV_MEMBARRIER_H */
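
The ordering argument in the comment can be hard to follow from prose alone; a rough timeline of the scenario it guards against (illustrative only, following the names used in the comment):

    /*
     * Illustrative interleaving for MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
     *
     *   CPU0 (membarrier syscall)                CPU1 (scheduler)
     *   -------------------------                ----------------
     *   prepare_sync_core_cmd(mm):
     *     set all bits in mm's icache_stale_mask
     *   smp_mb()                                 rq->curr = task using mm
     *   loads cpu_rq(1)->curr->mm, sees != mm,   switch_mm(prev, next, task):
     *   so the sync-core IPI to CPU1 is skipped    membarrier_arch_switch_mm(): smp_mb()
     *                                              deferred icache flush sees the stale
     *                                              bit and issues fence.i before the
     *                                              task returns to user-space
     */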
index d169a4f41a2e728276a97898e1270c7b4763f9ed..deaf971253a20102685aff042b10f8edbf7e614f 100644 (file)
@@ -95,7 +95,19 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
                __pud_free(mm, pud);
 }
 
-#define __pud_free_tlb(tlb, pud, addr)  pud_free((tlb)->mm, pud)
+static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
+                                 unsigned long addr)
+{
+       if (pgtable_l4_enabled) {
+               struct ptdesc *ptdesc = virt_to_ptdesc(pud);
+
+               pagetable_pud_dtor(ptdesc);
+               if (riscv_use_ipi_for_rfence())
+                       tlb_remove_page_ptdesc(tlb, ptdesc);
+               else
+                       tlb_remove_ptdesc(tlb, ptdesc);
+       }
+}
 
 #define p4d_alloc_one p4d_alloc_one
 static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
@@ -124,7 +136,16 @@ static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
                __p4d_free(mm, p4d);
 }
 
-#define __p4d_free_tlb(tlb, p4d, addr)  p4d_free((tlb)->mm, p4d)
+static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
+                                 unsigned long addr)
+{
+       if (pgtable_l5_enabled) {
+               if (riscv_use_ipi_for_rfence())
+                       tlb_remove_page_ptdesc(tlb, virt_to_ptdesc(p4d));
+               else
+                       tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d));
+       }
+}
 #endif /* __PAGETABLE_PMD_FOLDED */
 
 static inline void sync_kernel_mappings(pgd_t *pgd)
@@ -149,15 +170,31 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 
 #ifndef __PAGETABLE_PMD_FOLDED
 
-#define __pmd_free_tlb(tlb, pmd, addr)  pmd_free((tlb)->mm, pmd)
+static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
+                                 unsigned long addr)
+{
+       struct ptdesc *ptdesc = virt_to_ptdesc(pmd);
+
+       pagetable_pmd_dtor(ptdesc);
+       if (riscv_use_ipi_for_rfence())
+               tlb_remove_page_ptdesc(tlb, ptdesc);
+       else
+               tlb_remove_ptdesc(tlb, ptdesc);
+}
 
 #endif /* __PAGETABLE_PMD_FOLDED */
 
-#define __pte_free_tlb(tlb, pte, buf)                  \
-do {                                                   \
-       pagetable_pte_dtor(page_ptdesc(pte));           \
-       tlb_remove_page_ptdesc((tlb), page_ptdesc(pte));\
-} while (0)
+static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
+                                 unsigned long addr)
+{
+       struct ptdesc *ptdesc = page_ptdesc(pte);
+
+       pagetable_pte_dtor(ptdesc);
+       if (riscv_use_ipi_for_rfence())
+               tlb_remove_page_ptdesc(tlb, ptdesc);
+       else
+               tlb_remove_ptdesc(tlb, ptdesc);
+}
 #endif /* CONFIG_MMU */
 
 #endif /* _ASM_RISCV_PGALLOC_H */
index 0c94260b5d0c126f6302f39a59507f19eed48dac..b2e6965748f214d83531d1d31e315c0c26682c4f 100644 (file)
 #define VA_USER_SV48 (UL(1) << (VA_BITS_SV48 - 1))
 #define VA_USER_SV57 (UL(1) << (VA_BITS_SV57 - 1))
 
-#ifdef CONFIG_COMPAT
 #define MMAP_VA_BITS_64 ((VA_BITS >= VA_BITS_SV48) ? VA_BITS_SV48 : VA_BITS)
 #define MMAP_MIN_VA_BITS_64 (VA_BITS_SV39)
 #define MMAP_VA_BITS (is_compat_task() ? VA_BITS_SV32 : MMAP_VA_BITS_64)
 #define MMAP_MIN_VA_BITS (is_compat_task() ? VA_BITS_SV32 : MMAP_MIN_VA_BITS_64)
-#else
-#define MMAP_VA_BITS ((VA_BITS >= VA_BITS_SV48) ? VA_BITS_SV48 : VA_BITS)
-#define MMAP_MIN_VA_BITS (VA_BITS_SV39)
-#endif /* CONFIG_COMPAT */
-
 #else
 #include <asm/pgtable-32.h>
 #endif /* CONFIG_64BIT */
@@ -439,6 +433,12 @@ static inline pte_t pte_mkhuge(pte_t pte)
        return pte;
 }
 
+#ifdef CONFIG_RISCV_ISA_SVNAPOT
+#define pte_leaf_size(pte)     (pte_napot(pte) ?                               \
+                                       napot_cont_size(napot_cont_order(pte)) :\
+                                       PAGE_SIZE)
+#endif
+
 #ifdef CONFIG_NUMA_BALANCING
 /*
  * See the comment in include/asm-generic/pgtable.h
@@ -513,12 +513,12 @@ static inline void set_pte(pte_t *ptep, pte_t pteval)
        WRITE_ONCE(*ptep, pteval);
 }
 
-void flush_icache_pte(pte_t pte);
+void flush_icache_pte(struct mm_struct *mm, pte_t pte);
 
-static inline void __set_pte_at(pte_t *ptep, pte_t pteval)
+static inline void __set_pte_at(struct mm_struct *mm, pte_t *ptep, pte_t pteval)
 {
        if (pte_present(pteval) && pte_exec(pteval))
-               flush_icache_pte(pteval);
+               flush_icache_pte(mm, pteval);
 
        set_pte(ptep, pteval);
 }
@@ -529,7 +529,7 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
        page_table_check_ptes_set(mm, ptep, pteval, nr);
 
        for (;;) {
-               __set_pte_at(ptep, pteval);
+               __set_pte_at(mm, ptep, pteval);
                if (--nr == 0)
                        break;
                ptep++;
@@ -541,7 +541,7 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
 static inline void pte_clear(struct mm_struct *mm,
        unsigned long addr, pte_t *ptep)
 {
-       __set_pte_at(ptep, __pte(0));
+       __set_pte_at(mm, ptep, __pte(0));
 }
 
 #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS      /* defined in mm/pgtable.c */
@@ -656,6 +656,12 @@ static inline int pmd_write(pmd_t pmd)
        return pte_write(pmd_pte(pmd));
 }
 
+#define pud_write pud_write
+static inline int pud_write(pud_t pud)
+{
+       return pte_write(pud_pte(pud));
+}
+
 #define pmd_dirty pmd_dirty
 static inline int pmd_dirty(pmd_t pmd)
 {
@@ -707,14 +713,14 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                                pmd_t *pmdp, pmd_t pmd)
 {
        page_table_check_pmd_set(mm, pmdp, pmd);
-       return __set_pte_at((pte_t *)pmdp, pmd_pte(pmd));
+       return __set_pte_at(mm, (pte_t *)pmdp, pmd_pte(pmd));
 }
 
 static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
                                pud_t *pudp, pud_t pud)
 {
        page_table_check_pud_set(mm, pudp, pud);
-       return __set_pte_at((pte_t *)pudp, pud_pte(pud));
+       return __set_pte_at(mm, (pte_t *)pudp, pud_pte(pud));
 }
 
 #ifdef CONFIG_PAGE_TABLE_CHECK
@@ -865,8 +871,8 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte)
 #define TASK_SIZE_MIN  (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2)
 
 #ifdef CONFIG_COMPAT
-#define TASK_SIZE_32   (_AC(0x80000000, UL))
-#define TASK_SIZE      (test_thread_flag(TIF_32BIT) ? \
+#define TASK_SIZE_32   (_AC(0x80000000, UL) - PAGE_SIZE)
+#define TASK_SIZE      (is_compat_task() ? \
                         TASK_SIZE_32 : TASK_SIZE_64)
 #else
 #define TASK_SIZE      TASK_SIZE_64
index a8509cc31ab25a5dcc75765bdb99e43e87dded3b..0faf5f161f1e4957b4cbeac63cf4d69c4dbd55e6 100644 (file)
 
 #include <asm/ptrace.h>
 
-#ifdef CONFIG_64BIT
-#define DEFAULT_MAP_WINDOW     (UL(1) << (MMAP_VA_BITS - 1))
-#define STACK_TOP_MAX          TASK_SIZE
-
+/*
+ * addr is a hint to the maximum userspace address that mmap should provide, so
+ * this macro needs to return the largest address space available so that
+ * mmap_end < addr, where mmap_end is the top of that address space.
+ * See Documentation/arch/riscv/vm-layout.rst for more details.
+ */
 #define arch_get_mmap_end(addr, len, flags)                    \
 ({                                                             \
        unsigned long mmap_end;                                 \
        typeof(addr) _addr = (addr);                            \
-       if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) && is_compat_task())) \
-               mmap_end = STACK_TOP_MAX;                       \
-       else if ((_addr) >= VA_USER_SV57)                       \
+       if ((_addr) == 0 || is_compat_task() ||                 \
+           ((_addr + len) > BIT(VA_BITS - 1)))                 \
                mmap_end = STACK_TOP_MAX;                       \
-       else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >= VA_BITS_SV48)) \
-               mmap_end = VA_USER_SV48;                        \
        else                                                    \
-               mmap_end = VA_USER_SV39;                        \
+               mmap_end = (_addr + len);                       \
        mmap_end;                                               \
 })
 
        typeof(addr) _addr = (addr);                            \
        typeof(base) _base = (base);                            \
        unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base);   \
-       if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) && is_compat_task())) \
+       if ((_addr) == 0 || is_compat_task() ||                 \
+           ((_addr + len) > BIT(VA_BITS - 1)))                 \
                mmap_base = (_base);                            \
-       else if (((_addr) >= VA_USER_SV57) && (VA_BITS >= VA_BITS_SV57)) \
-               mmap_base = VA_USER_SV57 - rnd_gap;             \
-       else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >= VA_BITS_SV48)) \
-               mmap_base = VA_USER_SV48 - rnd_gap;             \
        else                                                    \
-               mmap_base = VA_USER_SV39 - rnd_gap;             \
+               mmap_base = (_addr + len) - rnd_gap;            \
        mmap_base;                                              \
 })
 
+#ifdef CONFIG_64BIT
+#define DEFAULT_MAP_WINDOW     (UL(1) << (MMAP_VA_BITS - 1))
+#define STACK_TOP_MAX          TASK_SIZE_64
 #else
 #define DEFAULT_MAP_WINDOW     TASK_SIZE
 #define STACK_TOP_MAX          TASK_SIZE
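
A short worked example of how the rewritten arch_get_mmap_end() treats the hint (values are illustrative, assuming VA_BITS = 48, i.e. Sv48):

    /* addr = 0                         -> mmap_end = STACK_TOP_MAX (no restriction)
     * addr = 0x3fffff0000, len = 64K   -> addr + len = 2^38 <= BIT(47), so
     *                                     mmap_end = addr + len and the allocator
     *                                     stays below that bound
     * addr + len > BIT(47)             -> mmap_end = STACK_TOP_MAX again, i.e. the
     *                                     full address space (up to Sv57 if present)
     */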
index 54efbf523d49c67d75921c7f8454efc87ad0f257..adb50f3ec2057ba02252a5d016c00a3b102059a1 100644 (file)
@@ -34,9 +34,9 @@ static __must_check inline bool may_use_simd(void)
                return false;
 
        /*
-        * Nesting is acheived in preempt_v by spreading the control for
+        * Nesting is achieved in preempt_v by spreading the control for
         * preemptible and non-preemptible kernel-mode Vector into two fields.
-        * Always try to match with prempt_v if kernel V-context exists. Then,
+        * Always try to match with preempt_v if kernel V-context exists. Then,
         * fallback to check non preempt_v if nesting happens, or if the config
         * is not set.
         */
index 02f87867389a9e660f91b64c7ca818a6b61637dc..076f8a9437cf5af85defbc506c79dd8b5e47438d 100644 (file)
@@ -55,4 +55,7 @@ int hibernate_resume_nonboot_cpu_disable(void);
 asmlinkage void hibernate_restore_image(unsigned long resume_satp, unsigned long satp_temp,
                                        unsigned long cpu_resume);
 asmlinkage int hibernate_core_restore_code(void);
+bool riscv_sbi_hsm_is_supported(void);
+bool riscv_sbi_suspend_state_is_valid(u32 state);
+int riscv_sbi_hart_suspend(u32 state);
 #endif
diff --git a/arch/riscv/include/asm/sync_core.h b/arch/riscv/include/asm/sync_core.h
new file mode 100644 (file)
index 0000000..9153016
--- /dev/null
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_RISCV_SYNC_CORE_H
+#define _ASM_RISCV_SYNC_CORE_H
+
+/*
+ * RISC-V implements return to user-space through an xRET instruction,
+ * which is not core serializing.
+ */
+static inline void sync_core_before_usermode(void)
+{
+       asm volatile ("fence.i" ::: "memory");
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Ensure the next switch_mm() on every CPU issues a core serializing
+ * instruction for the given @mm.
+ */
+static inline void prepare_sync_core_cmd(struct mm_struct *mm)
+{
+       cpumask_setall(&mm->context.icache_stale_mask);
+}
+#else
+static inline void prepare_sync_core_cmd(struct mm_struct *mm)
+{
+}
+#endif /* CONFIG_SMP */
+
+#endif /* _ASM_RISCV_SYNC_CORE_H */
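
Since this header is what lets RISC-V honour the SYNC_CORE membarrier commands, a minimal sketch of a userspace caller (for example a JIT publishing freshly written code) may help; the wrapper name is made up, and in real code the registration would happen once at startup:

    #include <linux/membarrier.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Hypothetical helper: register, then force a core-serializing event
     * (fence.i via sync_core_before_usermode()) on every thread of the process. */
    static int sync_cores_with_other_threads(void)
    {
            if (syscall(__NR_membarrier,
                        MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0))
                    return -1;

            return syscall(__NR_membarrier,
                           MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0, 0);
    }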
index 1eb5682b2af6065c9019e398df729f5b97a573c6..a0b8b853503fe7994e1c95410f5613a9b40fefb4 100644 (file)
@@ -10,6 +10,24 @@ struct mmu_gather;
 
 static void tlb_flush(struct mmu_gather *tlb);
 
+#ifdef CONFIG_MMU
+#include <linux/swap.h>
+
+/*
+ * Platforms with riscv_ipi_for_rfence set to true use an IPI to perform TLB
+ * shootdown, while platforms with it set to false use SBI calls for TLB
+ * shootdown instead. To keep software pagetable walkers safe in the SBI case
+ * we switch to RCU based table free (MMU_GATHER_RCU_TABLE_FREE). See the
+ * comment below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h
+ * for more details.
+ */
+static inline void __tlb_remove_table(void *table)
+{
+       free_page_and_swap_cache(table);
+}
+
+#endif /* CONFIG_MMU */
+
 #define tlb_flush tlb_flush
 #include <asm-generic/tlb.h>
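
The walker-safety concern reads roughly like the following race (illustration only, not code from this series):

    /*
     * Race the RCU table free closes on SBI-rfence platforms, where no IPI
     * ever interrupts the lockless walker:
     *
     *   CPU A (lockless GUP walk)          CPU B (munmap / mmu_gather)
     *   -------------------------          ---------------------------
     *   reads a pmd entry pointing         clears the pmd and frees the
     *   at a PTE page, keeps walking       PTE page; without RCU the page
     *   that page                          could be reused while A still
     *                                      dereferences it
     *   with MMU_GATHER_RCU_TABLE_FREE the page is only freed after a
     *   grace period, i.e. after A's walk has finished.
     */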
 
index 0cd6f0a027d1f7ae7bb95b509bad3400c9fa71a5..731dcd0ed4de92ac7a00b1e1c534b2c33d905d2d 100644 (file)
@@ -284,4 +284,15 @@ static inline bool riscv_v_vstate_ctrl_user_allowed(void) { return false; }
 
 #endif /* CONFIG_RISCV_ISA_V */
 
+/*
+ * Return the implementation's vlen value.
+ *
+ * riscv_v_vsize holds the save area size of the 32 vector registers, i.e.
+ * 32 * vlenb bytes, so rebuild the vlen value in bits from it.
+ */
+static inline int riscv_vector_vlen(void)
+{
+       return riscv_v_vsize / 32 * 8;
+}
+
 #endif /* ! __ASM_RISCV_VECTOR_H */
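
A quick sanity check of the conversion, assuming a hypothetical implementation with VLEN = 128 bits:

    /* vlenb = VLEN / 8 = 16 bytes
     * riscv_v_vsize = 32 * vlenb = 512 bytes  (save area of the 32 vector regs)
     * riscv_vector_vlen() = 512 / 32 * 8 = 128 bits, matching VLEN
     */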
index e55407ace0c3617631df5c887338bf7e43727e7f..2f2bb0c84f9a71f4350e6794c913c9b9bee442e4 100644 (file)
@@ -5,7 +5,7 @@
 #ifndef ASM_VENDOR_LIST_H
 #define ASM_VENDOR_LIST_H
 
-#define ANDESTECH_VENDOR_ID    0x31e
+#define ANDES_VENDOR_ID                0x31e
 #define SIFIVE_VENDOR_ID       0x489
 #define THEAD_VENDOR_ID                0x5b7
 
index 924d01b56c9a1eb1eacd53a923fc55591cda654f..51f6dfe19745aa486bd73d7de472faa538cf0486 100644 (file)
@@ -19,65 +19,6 @@ static inline bool arch_vmap_pmd_supported(pgprot_t prot)
        return true;
 }
 
-#ifdef CONFIG_RISCV_ISA_SVNAPOT
-#include <linux/pgtable.h>
+#endif
 
-#define arch_vmap_pte_range_map_size arch_vmap_pte_range_map_size
-static inline unsigned long arch_vmap_pte_range_map_size(unsigned long addr, unsigned long end,
-                                                        u64 pfn, unsigned int max_page_shift)
-{
-       unsigned long map_size = PAGE_SIZE;
-       unsigned long size, order;
-
-       if (!has_svnapot())
-               return map_size;
-
-       for_each_napot_order_rev(order) {
-               if (napot_cont_shift(order) > max_page_shift)
-                       continue;
-
-               size = napot_cont_size(order);
-               if (end - addr < size)
-                       continue;
-
-               if (!IS_ALIGNED(addr, size))
-                       continue;
-
-               if (!IS_ALIGNED(PFN_PHYS(pfn), size))
-                       continue;
-
-               map_size = size;
-               break;
-       }
-
-       return map_size;
-}
-
-#define arch_vmap_pte_supported_shift arch_vmap_pte_supported_shift
-static inline int arch_vmap_pte_supported_shift(unsigned long size)
-{
-       int shift = PAGE_SHIFT;
-       unsigned long order;
-
-       if (!has_svnapot())
-               return shift;
-
-       WARN_ON_ONCE(size >= PMD_SIZE);
-
-       for_each_napot_order_rev(order) {
-               if (napot_cont_size(order) > size)
-                       continue;
-
-               if (!IS_ALIGNED(size, napot_cont_size(order)))
-                       continue;
-
-               shift = napot_cont_shift(order);
-               break;
-       }
-
-       return shift;
-}
-
-#endif /* CONFIG_RISCV_ISA_SVNAPOT */
-#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
 #endif /* _ASM_RISCV_VMALLOC_H */
index f71910718053d841a361fd97e7d62da4f86bebcf..c8085126a6f98969ec27ab46af8ca8dfe77a3bf4 100644 (file)
@@ -38,7 +38,6 @@ extra-y += vmlinux.lds
 obj-y  += head.o
 obj-y  += soc.o
 obj-$(CONFIG_RISCV_ALTERNATIVE) += alternative.o
-obj-y  += copy-unaligned.o
 obj-y  += cpu.o
 obj-y  += cpufeature.o
 obj-y  += entry.o
@@ -62,6 +61,9 @@ obj-y += tests/
 obj-$(CONFIG_MMU) += vdso.o vdso/
 
 obj-$(CONFIG_RISCV_MISALIGNED) += traps_misaligned.o
+obj-$(CONFIG_RISCV_MISALIGNED) += unaligned_access_speed.o
+obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)     += copy-unaligned.o
+
 obj-$(CONFIG_FPU)              += fpu.o
 obj-$(CONFIG_RISCV_ISA_V)      += vector.o
 obj-$(CONFIG_RISCV_ISA_V)      += kernel_mode_vector.o
index 319a1da0358b4924b706d1eaff6dc89343d6e54d..0128b161bfdab2d88377b99386490e8a4f3571d8 100644 (file)
@@ -43,7 +43,7 @@ static void riscv_fill_cpu_mfr_info(struct cpu_manufacturer_info_t *cpu_mfr_info
 
        switch (cpu_mfr_info->vendor_id) {
 #ifdef CONFIG_ERRATA_ANDES
-       case ANDESTECH_VENDOR_ID:
+       case ANDES_VENDOR_ID:
                cpu_mfr_info->patch_func = andes_errata_patch_func;
                break;
 #endif
index 89920f84d0a34385471e9afbf9c26d287cbbd838..afeae3ff43dc1f880594708444121b823ba8d7ce 100644 (file)
@@ -11,7 +11,6 @@
 #include <linux/cpu.h>
 #include <linux/cpuhotplug.h>
 #include <linux/ctype.h>
-#include <linux/jump_label.h>
 #include <linux/log2.h>
 #include <linux/memory.h>
 #include <linux/module.h>
 #include <asm/cacheflush.h>
 #include <asm/cpufeature.h>
 #include <asm/hwcap.h>
-#include <asm/hwprobe.h>
 #include <asm/patch.h>
 #include <asm/processor.h>
 #include <asm/vector.h>
 
-#include "copy-unaligned.h"
-
 #define NUM_ALPHA_EXTS ('z' - 'a' + 1)
 
-#define MISALIGNED_ACCESS_JIFFIES_LG2 1
-#define MISALIGNED_BUFFER_SIZE 0x4000
-#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
-#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
-
 unsigned long elf_hwcap __read_mostly;
 
 /* Host ISA bitmap */
@@ -43,11 +34,6 @@ static DECLARE_BITMAP(riscv_isa, RISCV_ISA_EXT_MAX) __read_mostly;
 /* Per-cpu ISA extensions. */
 struct riscv_isainfo hart_isa[NR_CPUS];
 
-/* Performance information */
-DEFINE_PER_CPU(long, misaligned_access_speed);
-
-static cpumask_t fast_misaligned_access;
-
 /**
  * riscv_isa_extension_base() - Get base extension word
  *
@@ -307,6 +293,7 @@ const struct riscv_isa_ext_data riscv_isa_ext[] = {
        __RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL),
        __RISCV_ISA_EXT_DATA(svnapot, RISCV_ISA_EXT_SVNAPOT),
        __RISCV_ISA_EXT_DATA(svpbmt, RISCV_ISA_EXT_SVPBMT),
+       __RISCV_ISA_EXT_DATA(xandespmu, RISCV_ISA_EXT_XANDESPMU),
 };
 
 const size_t riscv_isa_ext_count = ARRAY_SIZE(riscv_isa_ext);
@@ -706,247 +693,6 @@ unsigned long riscv_get_elf_hwcap(void)
        return hwcap;
 }
 
-static int check_unaligned_access(void *param)
-{
-       int cpu = smp_processor_id();
-       u64 start_cycles, end_cycles;
-       u64 word_cycles;
-       u64 byte_cycles;
-       int ratio;
-       unsigned long start_jiffies, now;
-       struct page *page = param;
-       void *dst;
-       void *src;
-       long speed = RISCV_HWPROBE_MISALIGNED_SLOW;
-
-       if (check_unaligned_access_emulated(cpu))
-               return 0;
-
-       /* Make an unaligned destination buffer. */
-       dst = (void *)((unsigned long)page_address(page) | 0x1);
-       /* Unalign src as well, but differently (off by 1 + 2 = 3). */
-       src = dst + (MISALIGNED_BUFFER_SIZE / 2);
-       src += 2;
-       word_cycles = -1ULL;
-       /* Do a warmup. */
-       __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
-       preempt_disable();
-       start_jiffies = jiffies;
-       while ((now = jiffies) == start_jiffies)
-               cpu_relax();
-
-       /*
-        * For a fixed amount of time, repeatedly try the function, and take
-        * the best time in cycles as the measurement.
-        */
-       while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
-               start_cycles = get_cycles64();
-               /* Ensure the CSR read can't reorder WRT to the copy. */
-               mb();
-               __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
-               /* Ensure the copy ends before the end time is snapped. */
-               mb();
-               end_cycles = get_cycles64();
-               if ((end_cycles - start_cycles) < word_cycles)
-                       word_cycles = end_cycles - start_cycles;
-       }
-
-       byte_cycles = -1ULL;
-       __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
-       start_jiffies = jiffies;
-       while ((now = jiffies) == start_jiffies)
-               cpu_relax();
-
-       while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
-               start_cycles = get_cycles64();
-               mb();
-               __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
-               mb();
-               end_cycles = get_cycles64();
-               if ((end_cycles - start_cycles) < byte_cycles)
-                       byte_cycles = end_cycles - start_cycles;
-       }
-
-       preempt_enable();
-
-       /* Don't divide by zero. */
-       if (!word_cycles || !byte_cycles) {
-               pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
-                       cpu);
-
-               return 0;
-       }
-
-       if (word_cycles < byte_cycles)
-               speed = RISCV_HWPROBE_MISALIGNED_FAST;
-
-       ratio = div_u64((byte_cycles * 100), word_cycles);
-       pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
-               cpu,
-               ratio / 100,
-               ratio % 100,
-               (speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow");
-
-       per_cpu(misaligned_access_speed, cpu) = speed;
-
-       /*
-        * Set the value of fast_misaligned_access of a CPU. These operations
-        * are atomic to avoid race conditions.
-        */
-       if (speed == RISCV_HWPROBE_MISALIGNED_FAST)
-               cpumask_set_cpu(cpu, &fast_misaligned_access);
-       else
-               cpumask_clear_cpu(cpu, &fast_misaligned_access);
-
-       return 0;
-}
-
-static void check_unaligned_access_nonboot_cpu(void *param)
-{
-       unsigned int cpu = smp_processor_id();
-       struct page **pages = param;
-
-       if (smp_processor_id() != 0)
-               check_unaligned_access(pages[cpu]);
-}
-
-DEFINE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);
-
-static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
-{
-       if (cpumask_weight(mask) == weight)
-               static_branch_enable_cpuslocked(&fast_misaligned_access_speed_key);
-       else
-               static_branch_disable_cpuslocked(&fast_misaligned_access_speed_key);
-}
-
-static void set_unaligned_access_static_branches_except_cpu(int cpu)
-{
-       /*
-        * Same as set_unaligned_access_static_branches, except excludes the
-        * given CPU from the result. When a CPU is hotplugged into an offline
-        * state, this function is called before the CPU is set to offline in
-        * the cpumask, and thus the CPU needs to be explicitly excluded.
-        */
-
-       cpumask_t fast_except_me;
-
-       cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
-       cpumask_clear_cpu(cpu, &fast_except_me);
-
-       modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
-}
-
-static void set_unaligned_access_static_branches(void)
-{
-       /*
-        * This will be called after check_unaligned_access_all_cpus so the
-        * result of unaligned access speed for all CPUs will be available.
-        *
-        * To avoid the number of online cpus changing between reading
-        * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
-        * held before calling this function.
-        */
-
-       cpumask_t fast_and_online;
-
-       cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);
-
-       modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
-}
-
-static int lock_and_set_unaligned_access_static_branch(void)
-{
-       cpus_read_lock();
-       set_unaligned_access_static_branches();
-       cpus_read_unlock();
-
-       return 0;
-}
-
-arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
-
-static int riscv_online_cpu(unsigned int cpu)
-{
-       static struct page *buf;
-
-       /* We are already set since the last check */
-       if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
-               goto exit;
-
-       buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
-       if (!buf) {
-               pr_warn("Allocation failure, not measuring misaligned performance\n");
-               return -ENOMEM;
-       }
-
-       check_unaligned_access(buf);
-       __free_pages(buf, MISALIGNED_BUFFER_ORDER);
-
-exit:
-       set_unaligned_access_static_branches();
-
-       return 0;
-}
-
-static int riscv_offline_cpu(unsigned int cpu)
-{
-       set_unaligned_access_static_branches_except_cpu(cpu);
-
-       return 0;
-}
-
-/* Measure unaligned access on all CPUs present at boot in parallel. */
-static int check_unaligned_access_all_cpus(void)
-{
-       unsigned int cpu;
-       unsigned int cpu_count = num_possible_cpus();
-       struct page **bufs = kzalloc(cpu_count * sizeof(struct page *),
-                                    GFP_KERNEL);
-
-       if (!bufs) {
-               pr_warn("Allocation failure, not measuring misaligned performance\n");
-               return 0;
-       }
-
-       /*
-        * Allocate separate buffers for each CPU so there's no fighting over
-        * cache lines.
-        */
-       for_each_cpu(cpu, cpu_online_mask) {
-               bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
-               if (!bufs[cpu]) {
-                       pr_warn("Allocation failure, not measuring misaligned performance\n");
-                       goto out;
-               }
-       }
-
-       /* Check everybody except 0, who stays behind to tend jiffies. */
-       on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
-
-       /* Check core 0. */
-       smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
-
-       /*
-        * Setup hotplug callbacks for any new CPUs that come online or go
-        * offline.
-        */
-       cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
-                                 riscv_online_cpu, riscv_offline_cpu);
-
-out:
-       unaligned_emulation_finish();
-       for_each_cpu(cpu, cpu_online_mask) {
-               if (bufs[cpu])
-                       __free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
-       }
-
-       kfree(bufs);
-       return 0;
-}
-
-arch_initcall(check_unaligned_access_all_cpus);
-
 void riscv_user_isa_enable(void)
 {
        if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_ZICBOZ))
index 9d1a305d55087bb3a6bdc73f8ed8ebe3206775b1..68a24cf9481afe0649e671bd764fa688307c23bf 100644 (file)
@@ -111,6 +111,7 @@ SYM_CODE_START(handle_exception)
 1:
        tail do_trap_unknown
 SYM_CODE_END(handle_exception)
+ASM_NOKPROBE(handle_exception)
 
 /*
  * The ret_from_exception must be called with interrupt disabled. Here is the
@@ -184,6 +185,7 @@ SYM_CODE_START_NOALIGN(ret_from_exception)
        sret
 #endif
 SYM_CODE_END(ret_from_exception)
+ASM_NOKPROBE(ret_from_exception)
 
 #ifdef CONFIG_VMAP_STACK
 SYM_CODE_START_LOCAL(handle_kernel_stack_overflow)
@@ -219,6 +221,7 @@ SYM_CODE_START_LOCAL(handle_kernel_stack_overflow)
        move a0, sp
        tail handle_bad_stack
 SYM_CODE_END(handle_kernel_stack_overflow)
+ASM_NOKPROBE(handle_kernel_stack_overflow)
 #endif
 
 SYM_CODE_START(ret_from_fork)
index 07915dc9279e9219fc2c7abc54ba749d4a48597e..b75f150b923d68c189e5017a9ddf0f171d099687 100644 (file)
@@ -9,6 +9,9 @@ KBUILD_CFLAGS   := $(subst $(CC_FLAGS_FTRACE),,$(KBUILD_CFLAGS)) -fpie \
                   -fno-asynchronous-unwind-tables -fno-unwind-tables \
                   $(call cc-option,-fno-addrsig)
 
+# Disable LTO
+KBUILD_CFLAGS  := $(filter-out $(CC_FLAGS_LTO), $(KBUILD_CFLAGS))
+
 KBUILD_CFLAGS  += -mcmodel=medany
 
 CFLAGS_cmdline_early.o += -D__NO_FORTIFY
index e8515aa9d80bf82fd6ff2598664b9fe18a6b1de3..92731ff8c79ad02b6f3db9375b84635d3ea13f41 100644 (file)
@@ -377,14 +377,14 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 
        return ret;
 }
+#else
+static const struct user_regset_view compat_riscv_user_native_view = {};
 #endif /* CONFIG_COMPAT */
 
 const struct user_regset_view *task_user_regset_view(struct task_struct *task)
 {
-#ifdef CONFIG_COMPAT
-       if (test_tsk_thread_flag(task, TIF_32BIT))
+       if (is_compat_thread(&task->thread_info))
                return &compat_riscv_user_native_view;
        else
-#endif
                return &riscv_user_native_view;
 }
index 519b6bd946e5d1b69edf3379e31b345e38a03deb..cfbe4b840d422d5735ad7ebcd89f2e8efd8dc0a4 100644 (file)
@@ -28,7 +28,6 @@
 
 #include <asm/cpufeature.h>
 #include <asm/cpu_ops.h>
-#include <asm/cpufeature.h>
 #include <asm/irq.h>
 #include <asm/mmu_context.h>
 #include <asm/numa.h>
index 239509367e4233336806c19da964a06537d5a9b5..b20f2cb5879f13863edd2bd76d95c8955192f39c 100644 (file)
@@ -128,4 +128,53 @@ static int __init sbi_system_suspend_init(void)
 }
 
 arch_initcall(sbi_system_suspend_init);
+
+static int sbi_suspend_finisher(unsigned long suspend_type,
+                               unsigned long resume_addr,
+                               unsigned long opaque)
+{
+       struct sbiret ret;
+
+       ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_SUSPEND,
+                       suspend_type, resume_addr, opaque, 0, 0, 0);
+
+       return (ret.error) ? sbi_err_map_linux_errno(ret.error) : 0;
+}
+
+int riscv_sbi_hart_suspend(u32 state)
+{
+       if (state & SBI_HSM_SUSP_NON_RET_BIT)
+               return cpu_suspend(state, sbi_suspend_finisher);
+       else
+               return sbi_suspend_finisher(state, 0, 0);
+}
+
+bool riscv_sbi_suspend_state_is_valid(u32 state)
+{
+       if (state > SBI_HSM_SUSPEND_RET_DEFAULT &&
+           state < SBI_HSM_SUSPEND_RET_PLATFORM)
+               return false;
+
+       if (state > SBI_HSM_SUSPEND_NON_RET_DEFAULT &&
+           state < SBI_HSM_SUSPEND_NON_RET_PLATFORM)
+               return false;
+
+       return true;
+}
+
+bool riscv_sbi_hsm_is_supported(void)
+{
+       /*
+        * The SBI HSM suspend function is only available when:
+        * 1) SBI version is 0.3 or higher
+        * 2) SBI HSM extension is available
+        */
+       if (sbi_spec_version < sbi_mk_version(0, 3) ||
+           !sbi_probe_extension(SBI_EXT_HSM)) {
+               pr_info("HSM suspend not available\n");
+               return false;
+       }
+
+       return true;
+}
 #endif /* CONFIG_RISCV_SBI */
index a7c56b41efd24d826a9baaed7575c3508e49a9de..8cae41a502dd4a9e9c3a23c3a63d998c3e9de2d3 100644 (file)
@@ -147,6 +147,7 @@ static bool hwprobe_ext0_has(const struct cpumask *cpus, unsigned long ext)
        return (pair.value & ext);
 }
 
+#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
 static u64 hwprobe_misaligned(const struct cpumask *cpus)
 {
        int cpu;
@@ -169,6 +170,18 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
 
        return perf;
 }
+#else
+static u64 hwprobe_misaligned(const struct cpumask *cpus)
+{
+       if (IS_ENABLED(CONFIG_RISCV_EFFICIENT_UNALIGNED_ACCESS))
+               return RISCV_HWPROBE_MISALIGNED_FAST;
+
+       if (IS_ENABLED(CONFIG_RISCV_EMULATED_UNALIGNED_ACCESS) && unaligned_ctl_available())
+               return RISCV_HWPROBE_MISALIGNED_EMULATED;
+
+       return RISCV_HWPROBE_MISALIGNED_SLOW;
+}
+#endif
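
For completeness, a hypothetical userspace query of the key this helper backs (assumes kernel headers that ship the uapi <asm/hwprobe.h> definitions and __NR_riscv_hwprobe):

    #include <asm/hwprobe.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Returns the misaligned-access performance class, or -1 on error. */
    static long misaligned_perf(void)
    {
            struct riscv_hwprobe pair = { .key = RISCV_HWPROBE_KEY_CPUPERF_0 };

            /* args: pairs, pair_count, cpusetsize, cpus, flags */
            if (syscall(__NR_riscv_hwprobe, &pair, 1, 0, NULL, 0))
                    return -1;

            return pair.value & RISCV_HWPROBE_MISALIGNED_MASK;
    }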
 
 static void hwprobe_one_pair(struct riscv_hwprobe *pair,
                             const struct cpumask *cpus)
index a1b9be3c4332d97f08b50beebfcadba5adaa02be..868d6280cf667e655de2d5003c2fd57d129b3127 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/cpu.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/randomize_kstack.h>
 #include <linux/sched.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/signal.h>
@@ -310,7 +311,8 @@ asmlinkage __visible __trap_section void do_trap_break(struct pt_regs *regs)
        }
 }
 
-asmlinkage __visible __trap_section void do_trap_ecall_u(struct pt_regs *regs)
+asmlinkage __visible __trap_section __no_stack_protector
+void do_trap_ecall_u(struct pt_regs *regs)
 {
        if (user_mode(regs)) {
                long syscall = regs->a7;
@@ -322,10 +324,23 @@ asmlinkage __visible __trap_section void do_trap_ecall_u(struct pt_regs *regs)
 
                syscall = syscall_enter_from_user_mode(regs, syscall);
 
+               add_random_kstack_offset();
+
                if (syscall >= 0 && syscall < NR_syscalls)
                        syscall_handler(regs, syscall);
                else if (syscall != -1)
                        regs->a0 = -ENOSYS;
+               /*
+                * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(),
+                * so the maximum stack offset is 1k bytes (10 bits).
+                *
+                * The actual entropy will be further reduced by the compiler when
+                * applying stack alignment constraints: 16-byte (i.e. 4-bit) aligned
+                * for RV32I or RV64I.
+                *
+                * The resulting 6 bits of entropy is seen in SP[9:4].
+                */
+               choose_random_kstack_offset(get_random_u16());
 
                syscall_exit_to_user_mode(regs);
        } else {
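
The entropy budget described in the comment works out as follows (a back-of-the-envelope restatement, not new code in the series):

    /* get_random_u16()        -> up to 16 random bits fed in
     * KSTACK_OFFSET_MAX()     -> masks the offset down to 10 bits (at most ~1 KiB)
     * 16-byte stack alignment -> the compiler discards the low 4 bits
     * remaining randomness    -> about 6 bits, visible as SP[9:4] on syscall entry
     */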
index 8ded225e8c5b1313d800c8f87878212c48c9b250..2adb7c3e4dd5bfc3fceb34a2e71bd915e5658674 100644 (file)
@@ -413,7 +413,9 @@ int handle_misaligned_load(struct pt_regs *regs)
 
        perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr);
 
+#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
        *this_cpu_ptr(&misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_EMULATED;
+#endif
 
        if (!unaligned_enabled)
                return -1;
@@ -596,7 +598,7 @@ int handle_misaligned_store(struct pt_regs *regs)
        return 0;
 }
 
-bool check_unaligned_access_emulated(int cpu)
+static bool check_unaligned_access_emulated(int cpu)
 {
        long *mas_ptr = per_cpu_ptr(&misaligned_access_speed, cpu);
        unsigned long tmp_var, tmp_val;
@@ -623,7 +625,7 @@ bool check_unaligned_access_emulated(int cpu)
        return misaligned_emu_detected;
 }
 
-void unaligned_emulation_finish(void)
+bool check_unaligned_access_emulated_all_cpus(void)
 {
        int cpu;
 
@@ -632,13 +634,12 @@ void unaligned_emulation_finish(void)
         * accesses emulated since tasks requesting such control can run on any
         * CPU.
         */
-       for_each_present_cpu(cpu) {
-               if (per_cpu(misaligned_access_speed, cpu) !=
-                                       RISCV_HWPROBE_MISALIGNED_EMULATED) {
-                       return;
-               }
-       }
+       for_each_online_cpu(cpu)
+               if (!check_unaligned_access_emulated(cpu))
+                       return false;
+
        unaligned_ctl = true;
+       return true;
 }
 
 bool unaligned_ctl_available(void)
diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
new file mode 100644 (file)
index 0000000..a9a6bcb
--- /dev/null
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2024 Rivos Inc.
+ */
+
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/jump_label.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/types.h>
+#include <asm/cpufeature.h>
+#include <asm/hwprobe.h>
+
+#include "copy-unaligned.h"
+
+#define MISALIGNED_ACCESS_JIFFIES_LG2 1
+#define MISALIGNED_BUFFER_SIZE 0x4000
+#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
+#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
+
+DEFINE_PER_CPU(long, misaligned_access_speed);
+
+#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
+static cpumask_t fast_misaligned_access;
+static int check_unaligned_access(void *param)
+{
+       int cpu = smp_processor_id();
+       u64 start_cycles, end_cycles;
+       u64 word_cycles;
+       u64 byte_cycles;
+       int ratio;
+       unsigned long start_jiffies, now;
+       struct page *page = param;
+       void *dst;
+       void *src;
+       long speed = RISCV_HWPROBE_MISALIGNED_SLOW;
+
+       if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
+               return 0;
+
+       /* Make an unaligned destination buffer. */
+       dst = (void *)((unsigned long)page_address(page) | 0x1);
+       /* Unalign src as well, but differently (off by 1 + 2 = 3). */
+       src = dst + (MISALIGNED_BUFFER_SIZE / 2);
+       src += 2;
+       word_cycles = -1ULL;
+       /* Do a warmup. */
+       __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+       preempt_disable();
+       start_jiffies = jiffies;
+       while ((now = jiffies) == start_jiffies)
+               cpu_relax();
+
+       /*
+        * For a fixed amount of time, repeatedly try the function, and take
+        * the best time in cycles as the measurement.
+        */
+       while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
+               start_cycles = get_cycles64();
+               /* Ensure the CSR read can't reorder WRT to the copy. */
+               mb();
+               __riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+               /* Ensure the copy ends before the end time is snapped. */
+               mb();
+               end_cycles = get_cycles64();
+               if ((end_cycles - start_cycles) < word_cycles)
+                       word_cycles = end_cycles - start_cycles;
+       }
+
+       byte_cycles = -1ULL;
+       __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+       start_jiffies = jiffies;
+       while ((now = jiffies) == start_jiffies)
+               cpu_relax();
+
+       while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
+               start_cycles = get_cycles64();
+               mb();
+               __riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+               mb();
+               end_cycles = get_cycles64();
+               if ((end_cycles - start_cycles) < byte_cycles)
+                       byte_cycles = end_cycles - start_cycles;
+       }
+
+       preempt_enable();
+
+       /* Don't divide by zero. */
+       if (!word_cycles || !byte_cycles) {
+               pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
+                       cpu);
+
+               return 0;
+       }
+
+       if (word_cycles < byte_cycles)
+               speed = RISCV_HWPROBE_MISALIGNED_FAST;
+
+       ratio = div_u64((byte_cycles * 100), word_cycles);
+       pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
+               cpu,
+               ratio / 100,
+               ratio % 100,
+               (speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow");
+
+       per_cpu(misaligned_access_speed, cpu) = speed;
+
+       /*
+        * Set the value of fast_misaligned_access of a CPU. These operations
+        * are atomic to avoid race conditions.
+        */
+       if (speed == RISCV_HWPROBE_MISALIGNED_FAST)
+               cpumask_set_cpu(cpu, &fast_misaligned_access);
+       else
+               cpumask_clear_cpu(cpu, &fast_misaligned_access);
+
+       return 0;
+}
+
+static void check_unaligned_access_nonboot_cpu(void *param)
+{
+       unsigned int cpu = smp_processor_id();
+       struct page **pages = param;
+
+       if (smp_processor_id() != 0)
+               check_unaligned_access(pages[cpu]);
+}
+
+DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
+
+static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
+{
+       if (cpumask_weight(mask) == weight)
+               static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key);
+       else
+               static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key);
+}
+
+static void set_unaligned_access_static_branches_except_cpu(int cpu)
+{
+       /*
+        * Same as set_unaligned_access_static_branches, except excludes the
+        * given CPU from the result. When a CPU is hotplugged into an offline
+        * state, this function is called before the CPU is set to offline in
+        * the cpumask, and thus the CPU needs to be explicitly excluded.
+        */
+
+       cpumask_t fast_except_me;
+
+       cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
+       cpumask_clear_cpu(cpu, &fast_except_me);
+
+       modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
+}
+
+static void set_unaligned_access_static_branches(void)
+{
+       /*
+        * This will be called after check_unaligned_access_all_cpus so the
+        * result of unaligned access speed for all CPUs will be available.
+        *
+        * To avoid the number of online cpus changing between reading
+        * cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
+        * held before calling this function.
+        */
+
+       cpumask_t fast_and_online;
+
+       cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);
+
+       modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
+}
+
+static int lock_and_set_unaligned_access_static_branch(void)
+{
+       cpus_read_lock();
+       set_unaligned_access_static_branches();
+       cpus_read_unlock();
+
+       return 0;
+}
+
+arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
+
+static int riscv_online_cpu(unsigned int cpu)
+{
+       static struct page *buf;
+
+       /* We are already set since the last check */
+       if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
+               goto exit;
+
+       buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
+       if (!buf) {
+               pr_warn("Allocation failure, not measuring misaligned performance\n");
+               return -ENOMEM;
+       }
+
+       check_unaligned_access(buf);
+       __free_pages(buf, MISALIGNED_BUFFER_ORDER);
+
+exit:
+       set_unaligned_access_static_branches();
+
+       return 0;
+}
+
+static int riscv_offline_cpu(unsigned int cpu)
+{
+       set_unaligned_access_static_branches_except_cpu(cpu);
+
+       return 0;
+}
+
+/* Measure unaligned access speed on all CPUs present at boot in parallel. */
+static int check_unaligned_access_speed_all_cpus(void)
+{
+       unsigned int cpu;
+       unsigned int cpu_count = num_possible_cpus();
+       struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);
+
+       if (!bufs) {
+               pr_warn("Allocation failure, not measuring misaligned performance\n");
+               return 0;
+       }
+
+       /*
+        * Allocate separate buffers for each CPU so there's no fighting over
+        * cache lines.
+        */
+       for_each_cpu(cpu, cpu_online_mask) {
+               bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
+               if (!bufs[cpu]) {
+                       pr_warn("Allocation failure, not measuring misaligned performance\n");
+                       goto out;
+               }
+       }
+
+       /* Check everybody except 0, who stays behind to tend jiffies. */
+       on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
+
+       /* Check core 0. */
+       smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
+
+       /*
+        * Setup hotplug callbacks for any new CPUs that come online or go
+        * offline.
+        */
+       cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
+                                 riscv_online_cpu, riscv_offline_cpu);
+
+out:
+       for_each_cpu(cpu, cpu_online_mask) {
+               if (bufs[cpu])
+                       __free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
+       }
+
+       kfree(bufs);
+       return 0;
+}
+
+static int check_unaligned_access_all_cpus(void)
+{
+       bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
+
+       if (!all_cpus_emulated)
+               return check_unaligned_access_speed_all_cpus();
+
+       return 0;
+}
+#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
+static int check_unaligned_access_all_cpus(void)
+{
+       check_unaligned_access_emulated_all_cpus();
+
+       return 0;
+}
+#endif
+
+arch_initcall(check_unaligned_access_all_cpus);
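
A worked example of the ratio reported by check_unaligned_access() (the cycle counts are made up):

    /* byte_cycles = 4000, word_cycles = 1000
     * ratio = (4000 * 100) / 1000 = 400 -> logged as "4.00"
     * word_cycles < byte_cycles, so the CPU is marked RISCV_HWPROBE_MISALIGNED_FAST
     * and added to fast_misaligned_access for the static-key accounting.
     */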
index af3df5274ccbae0118488080040f45881a3e025a..7178e0acfa22841da893fad4ae4079ccf350e44d 100644 (file)
@@ -3,7 +3,7 @@
  * Checksum library
  *
  * Influenced by arch/arm64/lib/csum.c
- * Copyright (C) 2023 Rivos Inc.
+ * Copyright (C) 2023-2024 Rivos Inc.
  */
 #include <linux/bitops.h>
 #include <linux/compiler.h>
@@ -318,10 +318,7 @@ unsigned int do_csum(const unsigned char *buff, int len)
         * branches. The largest chunk of overlap was delegated into the
         * do_csum_common function.
         */
-       if (static_branch_likely(&fast_misaligned_access_speed_key))
-               return do_csum_no_alignment(buff, len);
-
-       if (((unsigned long)buff & OFFSET_MASK) == 0)
+       if (has_fast_unaligned_accesses() || (((unsigned long)buff & OFFSET_MASK) == 0))
                return do_csum_no_alignment(buff, len);
 
        return do_csum_with_alignment(buff, len);
index 51ab5588e9ff36b8b7dc80096be587d09da2881f..7c45f26de4f79b75f5235e3bdb64658aa6bafbce 100644 (file)
@@ -1,7 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0-only */
 
 #include <linux/linkage.h>
-#include <asm-generic/export.h>
 #include <asm/asm.h>
 #include <asm/asm-extable.h>
 #include <asm/csr.h>
index 55a34f2020a85a895932c92d94a7577bf410f8dc..bc61ee5975e4124ffc91ca52fd7a852ee9412597 100644 (file)
@@ -82,12 +82,12 @@ void flush_icache_mm(struct mm_struct *mm, bool local)
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_MMU
-void flush_icache_pte(pte_t pte)
+void flush_icache_pte(struct mm_struct *mm, pte_t pte)
 {
        struct folio *folio = page_folio(pte_page(pte));
 
        if (!test_bit(PG_dcache_clean, &folio->flags)) {
-               flush_icache_all();
+               flush_icache_mm(mm, false);
                set_bit(PG_dcache_clean, &folio->flags);
        }
 }
index 217fd4de6134224655db81759f0b35c73948e46d..ba8eb3944687cfd445770c357b32cb4fa0e37564 100644 (file)
@@ -323,6 +323,8 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
        if (unlikely(prev == next))
                return;
 
+       membarrier_arch_switch_mm(prev, next, task);
+
        /*
         * Mark the current MM context as inactive, and the next as
         * active.  This is at least used by the icache flushing
index 32cad6a65ccd23431d63097a0906ca5b8de485f8..c55915554836f1d17225952f5e53148defd216d5 100644 (file)
@@ -767,6 +767,11 @@ static int __init print_no5lvl(char *p)
 }
 early_param("no5lvl", print_no5lvl);
 
+static void __init set_mmap_rnd_bits_max(void)
+{
+       mmap_rnd_bits_max = MMAP_VA_BITS - PAGE_SHIFT - 3;
+}
+
 /*
  * There is a simple way to determine if 4-level is supported by the
  * underlying hardware: establish 1:1 mapping in 4-level page table mode
@@ -1081,6 +1086,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 
 #if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL)
        set_satp_mode(dtb_pa);
+       set_mmap_rnd_bits_max();
 #endif
 
        /*
index ef887efcb67900d94b97e603225f3c4d088fea9b..533ec9055fa0da75597bce9619d24f18c58577ac 100644 (file)
@@ -10,7 +10,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
                          pte_t entry, int dirty)
 {
        if (!pte_same(ptep_get(ptep), entry))
-               __set_pte_at(ptep, entry);
+               __set_pte_at(vma->vm_mm, ptep, entry);
        /*
         * update_mmu_cache will unconditionally execute, handling both
         * the case that the PTE changed and the spurious fault case.
index 7d156c75f15f2d3465d9041500bdbdb3dcb3d1ee..00e4aa16bf2bf7490cf7ab854c3ccf90d06450c3 100644 (file)
@@ -1496,6 +1496,9 @@ endif
 if PPC
 source "arch/powerpc/crypto/Kconfig"
 endif
+if RISCV
+source "arch/riscv/crypto/Kconfig"
+endif
 if S390
 source "arch/s390/crypto/Kconfig"
 endif
index 3c3f8037ebedddbbfa756f52c98fd209053559c7..1606eb622a9ffe2d7015724778035307d5aae1fd 100644 (file)
@@ -286,7 +286,7 @@ config ACPI_CPPC_LIB
 
 config ACPI_PROCESSOR
        tristate "Processor"
-       depends on X86 || ARM64 || LOONGARCH
+       depends on X86 || ARM64 || LOONGARCH || RISCV
        select ACPI_PROCESSOR_IDLE
        select ACPI_CPU_FREQ_PSS if X86 || LOONGARCH
        select THERMAL
index 8b3b126e0b940bd1492c759c6f267acd6d0f1dc6..86b0925f612d98b91f4801591768dc88aa9ae88d 100644 (file)
@@ -1,2 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
-obj-y  += rhct.o
+obj-y                                  += rhct.o
+obj-$(CONFIG_ACPI_PROCESSOR_IDLE)      += cpuidle.o
+obj-$(CONFIG_ACPI_CPPC_LIB)            += cppc.o
diff --git a/drivers/acpi/riscv/cppc.c b/drivers/acpi/riscv/cppc.c
new file mode 100644 (file)
index 0000000..4cdff38
--- /dev/null
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Implement CPPC FFH helper routines for RISC-V.
+ *
+ * Copyright (C) 2024 Ventana Micro Systems Inc.
+ */
+
+#include <acpi/cppc_acpi.h>
+#include <asm/csr.h>
+#include <asm/sbi.h>
+
+#define SBI_EXT_CPPC 0x43505043
+
+/* CPPC interfaces defined in SBI spec */
+#define SBI_CPPC_PROBE                 0x0
+#define SBI_CPPC_READ                  0x1
+#define SBI_CPPC_READ_HI               0x2
+#define SBI_CPPC_WRITE                 0x3
+
+/* RISC-V FFH definitions from RISC-V FFH spec */
+#define FFH_CPPC_TYPE(r)               (((r) & GENMASK_ULL(63, 60)) >> 60)
+#define FFH_CPPC_SBI_REG(r)            ((r) & GENMASK(31, 0))
+#define FFH_CPPC_CSR_NUM(r)            ((r) & GENMASK(11, 0))
+
+#define FFH_CPPC_SBI                   0x1
+#define FFH_CPPC_CSR                   0x2
+
+struct sbi_cppc_data {
+       u64 val;
+       u32 reg;
+       struct sbiret ret;
+};
+
+static bool cppc_ext_present;
+
+static int __init sbi_cppc_init(void)
+{
+       if (sbi_spec_version >= sbi_mk_version(2, 0) &&
+           sbi_probe_extension(SBI_EXT_CPPC) > 0) {
+               pr_info("SBI CPPC extension detected\n");
+               cppc_ext_present = true;
+       } else {
+               pr_info("SBI CPPC extension NOT detected!!\n");
+               cppc_ext_present = false;
+       }
+
+       return 0;
+}
+device_initcall(sbi_cppc_init);
+
+static void sbi_cppc_read(void *read_data)
+{
+       struct sbi_cppc_data *data = (struct sbi_cppc_data *)read_data;
+
+       data->ret = sbi_ecall(SBI_EXT_CPPC, SBI_CPPC_READ,
+                             data->reg, 0, 0, 0, 0, 0);
+}
+
+static void sbi_cppc_write(void *write_data)
+{
+       struct sbi_cppc_data *data = (struct sbi_cppc_data *)write_data;
+
+       data->ret = sbi_ecall(SBI_EXT_CPPC, SBI_CPPC_WRITE,
+                             data->reg, data->val, 0, 0, 0, 0);
+}
+
+static void cppc_ffh_csr_read(void *read_data)
+{
+       struct sbi_cppc_data *data = (struct sbi_cppc_data *)read_data;
+
+       switch (data->reg) {
+       /* Support only TIME CSR for now */
+       case CSR_TIME:
+               data->ret.value = csr_read(CSR_TIME);
+               data->ret.error = 0;
+               break;
+       default:
+               data->ret.error = -EINVAL;
+               break;
+       }
+}
+
+static void cppc_ffh_csr_write(void *write_data)
+{
+       struct sbi_cppc_data *data = (struct sbi_cppc_data *)write_data;
+
+       data->ret.error = -EINVAL;
+}
+
+/*
+ * Refer to drivers/acpi/cppc_acpi.c for the description of the functions
+ * below.
+ */
+bool cpc_ffh_supported(void)
+{
+       return true;
+}
+
+int cpc_read_ffh(int cpu, struct cpc_reg *reg, u64 *val)
+{
+       struct sbi_cppc_data data;
+
+       if (WARN_ON_ONCE(irqs_disabled()))
+               return -EPERM;
+
+       if (FFH_CPPC_TYPE(reg->address) == FFH_CPPC_SBI) {
+               if (!cppc_ext_present)
+                       return -EINVAL;
+
+               data.reg = FFH_CPPC_SBI_REG(reg->address);
+
+               smp_call_function_single(cpu, sbi_cppc_read, &data, 1);
+
+               *val = data.ret.value;
+
+               return (data.ret.error) ? sbi_err_map_linux_errno(data.ret.error) : 0;
+       } else if (FFH_CPPC_TYPE(reg->address) == FFH_CPPC_CSR) {
+               data.reg = FFH_CPPC_CSR_NUM(reg->address);
+
+               smp_call_function_single(cpu, cppc_ffh_csr_read, &data, 1);
+
+               *val = data.ret.value;
+
+               return (data.ret.error) ? sbi_err_map_linux_errno(data.ret.error) : 0;
+       }
+
+       return -EINVAL;
+}
+
+int cpc_write_ffh(int cpu, struct cpc_reg *reg, u64 val)
+{
+       struct sbi_cppc_data data;
+
+       if (WARN_ON_ONCE(irqs_disabled()))
+               return -EPERM;
+
+       if (FFH_CPPC_TYPE(reg->address) == FFH_CPPC_SBI) {
+               if (!cppc_ext_present)
+                       return -EINVAL;
+
+               data.reg = FFH_CPPC_SBI_REG(reg->address);
+               data.val = val;
+
+               smp_call_function_single(cpu, sbi_cppc_write, &data, 1);
+
+               return (data.ret.error) ? sbi_err_map_linux_errno(data.ret.error) : 0;
+       } else if (FFH_CPPC_TYPE(reg->address) == FFH_CPPC_CSR) {
+               data.reg = FFH_CPPC_CSR_NUM(reg->address);
+               data.val = val;
+
+               smp_call_function_single(cpu, cppc_ffh_csr_write, &data, 1);
+
+               return (data.ret.error) ? sbi_err_map_linux_errno(data.ret.error) : 0;
+       }
+
+       return -EINVAL;
+}
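
For reference, the FFH "address" layout consumed by cpc_read_ffh()/cpc_write_ffh() can be exercised outside the kernel. Below is a minimal userspace sketch with the GENMASK-based macros re-expressed as plain shifts and masks; the SBI register id in the first example is hypothetical, while 0xc01 is the standard RISC-V time CSR:

#include <stdio.h>
#include <stdint.h>

/* Same bit layout as the FFH_CPPC_* macros above. */
#define FFH_CPPC_TYPE(r)        (((r) >> 60) & 0xfULL)            /* bits 63:60 */
#define FFH_CPPC_SBI_REG(r)     ((uint32_t)((r) & 0xffffffffULL)) /* bits 31:0  */
#define FFH_CPPC_CSR_NUM(r)     ((uint32_t)((r) & 0xfffULL))      /* bits 11:0  */

#define FFH_CPPC_SBI            0x1
#define FFH_CPPC_CSR            0x2

static void decode(uint64_t address)
{
        switch (FFH_CPPC_TYPE(address)) {
        case FFH_CPPC_SBI:
                printf("%#llx -> SBI CPPC register %u\n",
                       (unsigned long long)address, FFH_CPPC_SBI_REG(address));
                break;
        case FFH_CPPC_CSR:
                printf("%#llx -> CSR number %#x\n",
                       (unsigned long long)address, FFH_CPPC_CSR_NUM(address));
                break;
        default:
                printf("%#llx -> unknown FFH type\n",
                       (unsigned long long)address);
        }
}

int main(void)
{
        decode(0x1000000000000006ULL);  /* hypothetical SBI CPPC register id 6 */
        decode(0x2000000000000c01ULL);  /* CSR 0xc01, the RISC-V time CSR */
        return 0;
}
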
diff --git a/drivers/acpi/riscv/cpuidle.c b/drivers/acpi/riscv/cpuidle.c
new file mode 100644 (file)
index 0000000..624f9bb
--- /dev/null
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024, Ventana Micro Systems Inc
+ *     Author: Sunil V L <sunilvl@ventanamicro.com>
+ *
+ */
+
+#include <linux/acpi.h>
+#include <acpi/processor.h>
+#include <linux/cpu_pm.h>
+#include <linux/cpuidle.h>
+#include <linux/suspend.h>
+#include <asm/cpuidle.h>
+#include <asm/sbi.h>
+#include <asm/suspend.h>
+
+#define RISCV_FFH_LPI_TYPE_MASK        GENMASK_ULL(63, 60)
+#define RISCV_FFH_LPI_RSVD_MASK        GENMASK_ULL(59, 32)
+
+#define RISCV_FFH_LPI_TYPE_SBI BIT_ULL(60)
+
+static int acpi_cpu_init_idle(unsigned int cpu)
+{
+       int i;
+       struct acpi_lpi_state *lpi;
+       struct acpi_processor *pr = per_cpu(processors, cpu);
+
+       if (unlikely(!pr || !pr->flags.has_lpi))
+               return -EINVAL;
+
+       if (!riscv_sbi_hsm_is_supported())
+               return -ENODEV;
+
+       if (pr->power.count <= 1)
+               return -ENODEV;
+
+       for (i = 1; i < pr->power.count; i++) {
+               u32 state;
+
+               lpi = &pr->power.lpi_states[i];
+
+               /*
+                * Validate Entry Method as per FFH spec.
+                * bits[63:60] should be 0x1
+                * bits[59:32] should be 0x0
+                * bits[31:0] represent a SBI power_state
+                */
+               if (((lpi->address & RISCV_FFH_LPI_TYPE_MASK) != RISCV_FFH_LPI_TYPE_SBI) ||
+                   (lpi->address & RISCV_FFH_LPI_RSVD_MASK)) {
+                       pr_warn("Invalid LPI entry method %#llx\n", lpi->address);
+                       return -EINVAL;
+               }
+
+               state = lpi->address;
+               if (!riscv_sbi_suspend_state_is_valid(state)) {
+                       pr_warn("Invalid SBI power state %#x\n", state);
+                       return -EINVAL;
+               }
+       }
+
+       return 0;
+}
+
+int acpi_processor_ffh_lpi_probe(unsigned int cpu)
+{
+       return acpi_cpu_init_idle(cpu);
+}
+
+int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi)
+{
+       u32 state = lpi->address;
+
+       if (state & SBI_HSM_SUSP_NON_RET_BIT)
+               return CPU_PM_CPU_IDLE_ENTER_PARAM(riscv_sbi_hart_suspend,
+                                                  lpi->index,
+                                                  state);
+       else
+               return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(riscv_sbi_hart_suspend,
+                                                            lpi->index,
+                                                            state);
+}
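
The entry-method validation in acpi_cpu_init_idle() is nothing more than bit tests on the 64-bit LPI address. A standalone sketch (not kernel code); the example encodings are hypothetical, with 0x80000000 standing in for an SBI non-retentive suspend type:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LPI_TYPE_MASK   0xf000000000000000ULL   /* bits [63:60] */
#define LPI_TYPE_SBI    0x1000000000000000ULL   /* entry method 0x1 = SBI */
#define LPI_RSVD_MASK   0x0fffffff00000000ULL   /* bits [59:32], must be zero */

static bool lpi_address_is_valid(uint64_t address)
{
        if ((address & LPI_TYPE_MASK) != LPI_TYPE_SBI)
                return false;                   /* not the SBI entry method */
        if (address & LPI_RSVD_MASK)
                return false;                   /* reserved bits set */
        return true;
}

int main(void)
{
        uint64_t ok  = 0x1000000080000000ULL;   /* low 32 bits: SBI power_state */
        uint64_t bad = 0x2000000080000000ULL;   /* wrong entry-method type */

        printf("%#llx -> %s, power_state %#x\n", (unsigned long long)ok,
               lpi_address_is_valid(ok) ? "valid" : "invalid", (unsigned int)ok);
        printf("%#llx -> %s\n", (unsigned long long)bad,
               lpi_address_is_valid(bad) ? "valid" : "invalid");
        return 0;
}
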
index 9a55e733ae995dc6d734e9eb14dee295cd59b230..09fd292eb83df06d5ba32ec7e1c1671e42be5217 100644 (file)
@@ -131,7 +131,7 @@ static int clint_timer_starting_cpu(unsigned int cpu)
        struct clock_event_device *ce = per_cpu_ptr(&clint_clock_event, cpu);
 
        ce->cpumask = cpumask_of(cpu);
-       clockevents_config_and_register(ce, clint_timer_freq, 100, 0x7fffffff);
+       clockevents_config_and_register(ce, clint_timer_freq, 100, ULONG_MAX);
 
        enable_percpu_irq(clint_timer_irq,
                          irq_get_trigger_type(clint_timer_irq));
index e66dcbd6656658dd32f913189fceebda87e8ccbf..87a7ac0ce6cec46e4b00fa06a90402f4e2ced9de 100644 (file)
@@ -114,7 +114,7 @@ static int riscv_timer_starting_cpu(unsigned int cpu)
                ce->features |= CLOCK_EVT_FEAT_C3STOP;
        if (static_branch_likely(&riscv_sstc_available))
                ce->rating = 450;
-       clockevents_config_and_register(ce, riscv_timebase, 100, 0x7fffffff);
+       clockevents_config_and_register(ce, riscv_timebase, 100, ULONG_MAX);
 
        enable_percpu_irq(riscv_clock_event_irq,
                          irq_get_trigger_type(riscv_clock_event_irq));
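
A back-of-the-envelope view of why the maximum delta moves from 0x7fffffff to ULONG_MAX: the RISC-V time comparator is 64 bits wide, so the old 31-bit cap needlessly limits how far ahead a one-shot event can be programmed. The sketch below assumes a 64-bit build and a 10 MHz timebase (a common QEMU value, not something stated in this patch):

#include <stdio.h>
#include <limits.h>

int main(void)
{
        const double timebase_hz = 10000000.0;  /* assumed 10 MHz timebase */
        unsigned long old_cap = 0x7fffffffUL;
        unsigned long new_cap = ULONG_MAX;      /* full 64-bit comparator range */

        printf("old max delta: %lu ticks ~ %.0f seconds\n",
               old_cap, old_cap / timebase_hz);
        printf("new max delta: %lu ticks ~ %.0f years\n",
               new_cap, new_cap / timebase_hz / (3600.0 * 24 * 365));
        return 0;
}
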
index 35efb53d5492a3c87cca82790345d202f10993dc..94e55c40970a6cf94e7356896ec427e93bd3000e 100644 (file)
@@ -302,4 +302,33 @@ config QORIQ_CPUFREQ
          which are capable of changing the CPU's frequency dynamically.
 
 endif
+
+config ACPI_CPPC_CPUFREQ
+       tristate "CPUFreq driver based on the ACPI CPPC spec"
+       depends on ACPI_PROCESSOR
+       depends on ARM || ARM64 || RISCV
+       select ACPI_CPPC_LIB
+       help
+         This adds a CPUFreq driver which uses CPPC methods
+         as described in the ACPIv5.1 spec. CPPC stands for
+         Collaborative Processor Performance Controls. It
+         is based on an abstract continuous scale of CPU
+         performance values which allows the remote power
+         processor to flexibly optimize for power and
+         performance. CPPC relies on power management firmware
+         support for its operation.
+
+         If in doubt, say N.
+
+config ACPI_CPPC_CPUFREQ_FIE
+       bool "Frequency Invariance support for CPPC cpufreq driver"
+       depends on ACPI_CPPC_CPUFREQ && GENERIC_ARCH_TOPOLOGY
+       depends on ARM || ARM64 || RISCV
+       default y
+       help
+         This extends frequency invariance support in the CPPC cpufreq driver,
+         by using CPPC delivered and reference performance counters.
+
+         If in doubt, say N.
+
 endmenu
index f911606897b8d1dcdb23e7dfdafc012f0879211e..987b3d900a89b809967f200fc002ded5bf0b2715 100644 (file)
@@ -3,32 +3,6 @@
 # ARM CPU Frequency scaling drivers
 #
 
-config ACPI_CPPC_CPUFREQ
-       tristate "CPUFreq driver based on the ACPI CPPC spec"
-       depends on ACPI_PROCESSOR
-       select ACPI_CPPC_LIB
-       help
-         This adds a CPUFreq driver which uses CPPC methods
-         as described in the ACPIv5.1 spec. CPPC stands for
-         Collaborative Processor Performance Controls. It
-         is based on an abstract continuous scale of CPU
-         performance values which allows the remote power
-         processor to flexibly optimize for power and
-         performance. CPPC relies on power management firmware
-         support for its operation.
-
-         If in doubt, say N.
-
-config ACPI_CPPC_CPUFREQ_FIE
-       bool "Frequency Invariance support for CPPC cpufreq driver"
-       depends on ACPI_CPPC_CPUFREQ && GENERIC_ARCH_TOPOLOGY
-       default y
-       help
-         This extends frequency invariance support in the CPPC cpufreq driver,
-         by using CPPC delivered and reference performance counters.
-
-         If in doubt, say N.
-
 config ARM_ALLWINNER_SUN50I_CPUFREQ_NVMEM
        tristate "Allwinner nvmem based SUN50I CPUFreq driver"
        depends on ARCH_SUNXI
index e8094fc92491ebd2754c7cf89223ee5f658529d4..a6e123dfe394d8a6b50063cee9bdf4fdf8ad1b48 100644 (file)
@@ -73,26 +73,6 @@ static inline bool sbi_is_domain_state_available(void)
        return data->available;
 }
 
-static int sbi_suspend_finisher(unsigned long suspend_type,
-                               unsigned long resume_addr,
-                               unsigned long opaque)
-{
-       struct sbiret ret;
-
-       ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_SUSPEND,
-                       suspend_type, resume_addr, opaque, 0, 0, 0);
-
-       return (ret.error) ? sbi_err_map_linux_errno(ret.error) : 0;
-}
-
-static int sbi_suspend(u32 state)
-{
-       if (state & SBI_HSM_SUSP_NON_RET_BIT)
-               return cpu_suspend(state, sbi_suspend_finisher);
-       else
-               return sbi_suspend_finisher(state, 0, 0);
-}
-
 static __cpuidle int sbi_cpuidle_enter_state(struct cpuidle_device *dev,
                                             struct cpuidle_driver *drv, int idx)
 {
@@ -100,9 +80,9 @@ static __cpuidle int sbi_cpuidle_enter_state(struct cpuidle_device *dev,
        u32 state = states[idx];
 
        if (state & SBI_HSM_SUSP_NON_RET_BIT)
-               return CPU_PM_CPU_IDLE_ENTER_PARAM(sbi_suspend, idx, state);
+               return CPU_PM_CPU_IDLE_ENTER_PARAM(riscv_sbi_hart_suspend, idx, state);
        else
-               return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(sbi_suspend,
+               return CPU_PM_CPU_IDLE_ENTER_RETENTION_PARAM(riscv_sbi_hart_suspend,
                                                             idx, state);
 }
 
@@ -133,7 +113,7 @@ static __cpuidle int __sbi_enter_domain_idle_state(struct cpuidle_device *dev,
        else
                state = states[idx];
 
-       ret = sbi_suspend(state) ? -1 : idx;
+       ret = riscv_sbi_hart_suspend(state) ? -1 : idx;
 
        ct_cpuidle_exit();
 
@@ -206,17 +186,6 @@ static const struct of_device_id sbi_cpuidle_state_match[] = {
        { },
 };
 
-static bool sbi_suspend_state_is_valid(u32 state)
-{
-       if (state > SBI_HSM_SUSPEND_RET_DEFAULT &&
-           state < SBI_HSM_SUSPEND_RET_PLATFORM)
-               return false;
-       if (state > SBI_HSM_SUSPEND_NON_RET_DEFAULT &&
-           state < SBI_HSM_SUSPEND_NON_RET_PLATFORM)
-               return false;
-       return true;
-}
-
 static int sbi_dt_parse_state_node(struct device_node *np, u32 *state)
 {
        int err = of_property_read_u32(np, "riscv,sbi-suspend-param", state);
@@ -226,7 +195,7 @@ static int sbi_dt_parse_state_node(struct device_node *np, u32 *state)
                return err;
        }
 
-       if (!sbi_suspend_state_is_valid(*state)) {
+       if (!riscv_sbi_suspend_state_is_valid(*state)) {
                pr_warn("Invalid SBI suspend state %#x\n", *state);
                return -EINVAL;
        }
@@ -607,16 +576,8 @@ static int __init sbi_cpuidle_init(void)
        int ret;
        struct platform_device *pdev;
 
-       /*
-        * The SBI HSM suspend function is only available when:
-        * 1) SBI version is 0.3 or higher
-        * 2) SBI HSM extension is available
-        */
-       if ((sbi_spec_version < sbi_mk_version(0, 3)) ||
-           !sbi_probe_extension(SBI_EXT_HSM)) {
-               pr_info("HSM suspend not available\n");
+       if (!riscv_sbi_hsm_is_supported())
                return 0;
-       }
 
        ret = platform_driver_register(&sbi_cpuidle_driver);
        if (ret)
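
The two helpers removed here move behind riscv_sbi_hart_suspend() and riscv_sbi_suspend_state_is_valid() so the ACPI LPI code above can share them. The validity check itself is reproduced below as a standalone sketch; the numeric values of the SBI_HSM_SUSPEND_* constants follow the SBI HSM suspend-type encoding and are assumptions here, since the diff only shows the symbolic names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SBI_HSM_SUSPEND_RET_DEFAULT             0x00000000u
#define SBI_HSM_SUSPEND_RET_PLATFORM            0x10000000u
#define SBI_HSM_SUSPEND_NON_RET_DEFAULT         0x80000000u
#define SBI_HSM_SUSPEND_NON_RET_PLATFORM        0x90000000u

static bool suspend_state_is_valid(uint32_t state)
{
        /* Anything strictly between "default" and "platform base" is reserved. */
        if (state > SBI_HSM_SUSPEND_RET_DEFAULT &&
            state < SBI_HSM_SUSPEND_RET_PLATFORM)
                return false;
        if (state > SBI_HSM_SUSPEND_NON_RET_DEFAULT &&
            state < SBI_HSM_SUSPEND_NON_RET_PLATFORM)
                return false;
        return true;
}

int main(void)
{
        uint32_t states[] = { 0x00000000, 0x00000001, 0x80000000, 0x90000001 };

        for (int i = 0; i < 4; i++)
                printf("suspend state %#010x -> %s\n", (unsigned int)states[i],
                       suspend_state_is_valid(states[i]) ? "valid" : "reserved");
        return 0;
}
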
index e8d01b14ccdde7848c7fb14489a5a015b86e8e95..0cd6b48a5dbf95a865961ca2d8e3b503b89cdfbf 100644 (file)
 #include <linux/module.h>
 #include <linux/of.h>
 #include <linux/smp.h>
+#include <linux/soc/andes/irq.h>
 
 static struct irq_domain *intc_domain;
+static unsigned int riscv_intc_nr_irqs __ro_after_init = BITS_PER_LONG;
+static unsigned int riscv_intc_custom_base __ro_after_init = BITS_PER_LONG;
+static unsigned int riscv_intc_custom_nr_irqs __ro_after_init;
 
 static asmlinkage void riscv_intc_irq(struct pt_regs *regs)
 {
        unsigned long cause = regs->cause & ~CAUSE_IRQ_FLAG;
 
-       if (unlikely(cause >= BITS_PER_LONG))
-               panic("unexpected interrupt cause");
-
-       generic_handle_domain_irq(intc_domain, cause);
+       if (generic_handle_domain_irq(intc_domain, cause))
+               pr_warn_ratelimited("Failed to handle interrupt (cause: %ld)\n", cause);
 }
 
 /*
@@ -47,6 +49,31 @@ static void riscv_intc_irq_unmask(struct irq_data *d)
        csr_set(CSR_IE, BIT(d->hwirq));
 }
 
+static void andes_intc_irq_mask(struct irq_data *d)
+{
+       /*
+        * Andes specific S-mode local interrupt causes (hwirq)
+        * are defined as (256 + n) and controlled by n-th bit
+        * of SLIE.
+        */
+       unsigned int mask = BIT(d->hwirq % BITS_PER_LONG);
+
+       if (d->hwirq < ANDES_SLI_CAUSE_BASE)
+               csr_clear(CSR_IE, mask);
+       else
+               csr_clear(ANDES_CSR_SLIE, mask);
+}
+
+static void andes_intc_irq_unmask(struct irq_data *d)
+{
+       unsigned int mask = BIT(d->hwirq % BITS_PER_LONG);
+
+       if (d->hwirq < ANDES_SLI_CAUSE_BASE)
+               csr_set(CSR_IE, mask);
+       else
+               csr_set(ANDES_CSR_SLIE, mask);
+}
+
 static void riscv_intc_irq_eoi(struct irq_data *d)
 {
        /*
@@ -70,12 +97,21 @@ static struct irq_chip riscv_intc_chip = {
        .irq_eoi = riscv_intc_irq_eoi,
 };
 
+static struct irq_chip andes_intc_chip = {
+       .name           = "RISC-V INTC",
+       .irq_mask       = andes_intc_irq_mask,
+       .irq_unmask     = andes_intc_irq_unmask,
+       .irq_eoi        = riscv_intc_irq_eoi,
+};
+
 static int riscv_intc_domain_map(struct irq_domain *d, unsigned int irq,
                                 irq_hw_number_t hwirq)
 {
+       struct irq_chip *chip = d->host_data;
+
        irq_set_percpu_devid(irq);
-       irq_domain_set_info(d, irq, hwirq, &riscv_intc_chip, d->host_data,
-                           handle_percpu_devid_irq, NULL, NULL);
+       irq_domain_set_info(d, irq, hwirq, chip, NULL, handle_percpu_devid_irq,
+                           NULL, NULL);
 
        return 0;
 }
@@ -93,6 +129,14 @@ static int riscv_intc_domain_alloc(struct irq_domain *domain,
        if (ret)
                return ret;
 
+       /*
+        * Only allow hwirq for which we have corresponding standard or
+        * custom interrupt enable register.
+        */
+       if ((hwirq >= riscv_intc_nr_irqs && hwirq < riscv_intc_custom_base) ||
+           (hwirq >= riscv_intc_custom_base + riscv_intc_custom_nr_irqs))
+               return -EINVAL;
+
        for (i = 0; i < nr_irqs; i++) {
                ret = riscv_intc_domain_map(domain, virq + i, hwirq + i);
                if (ret)
@@ -113,12 +157,12 @@ static struct fwnode_handle *riscv_intc_hwnode(void)
        return intc_domain->fwnode;
 }
 
-static int __init riscv_intc_init_common(struct fwnode_handle *fn)
+static int __init riscv_intc_init_common(struct fwnode_handle *fn,
+                                        struct irq_chip *chip)
 {
        int rc;
 
-       intc_domain = irq_domain_create_linear(fn, BITS_PER_LONG,
-                                              &riscv_intc_domain_ops, NULL);
+       intc_domain = irq_domain_create_tree(fn, &riscv_intc_domain_ops, chip);
        if (!intc_domain) {
                pr_err("unable to add IRQ domain\n");
                return -ENXIO;
@@ -132,7 +176,11 @@ static int __init riscv_intc_init_common(struct fwnode_handle *fn)
 
        riscv_set_intc_hwnode_fn(riscv_intc_hwnode);
 
-       pr_info("%d local interrupts mapped\n", BITS_PER_LONG);
+       pr_info("%d local interrupts mapped\n", riscv_intc_nr_irqs);
+       if (riscv_intc_custom_nr_irqs) {
+               pr_info("%d custom local interrupts mapped\n",
+                       riscv_intc_custom_nr_irqs);
+       }
 
        return 0;
 }
@@ -140,8 +188,9 @@ static int __init riscv_intc_init_common(struct fwnode_handle *fn)
 static int __init riscv_intc_init(struct device_node *node,
                                  struct device_node *parent)
 {
-       int rc;
+       struct irq_chip *chip = &riscv_intc_chip;
        unsigned long hartid;
+       int rc;
 
        rc = riscv_of_parent_hartid(node, &hartid);
        if (rc < 0) {
@@ -166,10 +215,17 @@ static int __init riscv_intc_init(struct device_node *node,
                return 0;
        }
 
-       return riscv_intc_init_common(of_node_to_fwnode(node));
+       if (of_device_is_compatible(node, "andestech,cpu-intc")) {
+               riscv_intc_custom_base = ANDES_SLI_CAUSE_BASE;
+               riscv_intc_custom_nr_irqs = ANDES_RV_IRQ_LAST;
+               chip = &andes_intc_chip;
+       }
+
+       return riscv_intc_init_common(of_node_to_fwnode(node), chip);
 }
 
 IRQCHIP_DECLARE(riscv, "riscv,cpu-intc", riscv_intc_init);
+IRQCHIP_DECLARE(andes, "andestech,cpu-intc", riscv_intc_init);
 
 #ifdef CONFIG_ACPI
 
@@ -196,7 +252,7 @@ static int __init riscv_intc_acpi_init(union acpi_subtable_headers *header,
                return -ENOMEM;
        }
 
-       return riscv_intc_init_common(fn);
+       return riscv_intc_init_common(fn, &riscv_intc_chip);
 }
 
 IRQCHIP_ACPI_DECLARE(riscv_intc, ACPI_MADT_TYPE_RINTC, NULL,
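
The "% BITS_PER_LONG" in the Andes mask/unmask helpers is the whole trick: standard causes below 256 index the architectural ie CSR, while the Andes local causes (256 + n) land on bit n of the custom SLIE CSR. A standalone sketch using the constants from the new <linux/soc/andes/irq.h>; the same arithmetic later gives riscv_pmu_irq_mask in the SBI PMU driver:

#include <stdio.h>

#define BITS_PER_LONG           (8 * sizeof(unsigned long))
#define ANDES_SLI_CAUSE_BASE    256
#define ANDES_RV_IRQ_PMOVI      18

int main(void)
{
        unsigned long hwirqs[] = {
                9,                                         /* S-mode external interrupt */
                ANDES_SLI_CAUSE_BASE + ANDES_RV_IRQ_PMOVI, /* Andes PMU overflow */
        };

        for (int i = 0; i < 2; i++) {
                unsigned long hwirq = hwirqs[i];
                unsigned long mask = 1UL << (hwirq % BITS_PER_LONG);

                printf("hwirq %lu -> %s, mask %#lx (bit %lu)\n", hwirq,
                       hwirq < ANDES_SLI_CAUSE_BASE ? "CSR ie" : "ANDES SLIE",
                       mask, hwirq % BITS_PER_LONG);
        }
        return 0;
}
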
index ec6e0d9194a1c577c1378444470a23ff614fc101..564e813d8c69b2195b84eb54cd43490733fe21de 100644 (file)
@@ -86,6 +86,20 @@ config RISCV_PMU_SBI
          full perf feature support i.e. counter overflow, privilege mode
          filtering, counter configuration.
 
+config ANDES_CUSTOM_PMU
+       bool "Andes custom PMU support"
+       depends on ARCH_RENESAS && RISCV_ALTERNATIVE && RISCV_PMU_SBI
+       default y
+       help
+         The Andes cores implement the PMU overflow extension in a way
+         very similar to the standard Sscofpmf and Smcntrpmf extensions.
+
+         This will patch the overflow and pending CSRs and handle the
+         non-standard behaviour via the regular SBI PMU driver and
+         interface.
+
+         If you don't know what to do here, say "Y".
+
 config ARM_PMU_ACPI
        depends on ARM_PMU && ACPI
        def_bool y
index 16acd4dcdb96c75e07b45a3745a71842f2d7d2b8..bbd6fe021b3a982f62eae7a7c7818c8d080967a9 100644 (file)
 #include <linux/of.h>
 #include <linux/cpu_pm.h>
 #include <linux/sched/clock.h>
+#include <linux/soc/andes/irq.h>
 
 #include <asm/errata_list.h>
 #include <asm/sbi.h>
 #include <asm/cpufeature.h>
 
+#define ALT_SBI_PMU_OVERFLOW(__ovl)                                    \
+asm volatile(ALTERNATIVE_2(                                            \
+       "csrr %0, " __stringify(CSR_SSCOUNTOVF),                        \
+       "csrr %0, " __stringify(THEAD_C9XX_CSR_SCOUNTEROF),             \
+               THEAD_VENDOR_ID, ERRATA_THEAD_PMU,                      \
+               CONFIG_ERRATA_THEAD_PMU,                                \
+       "csrr %0, " __stringify(ANDES_CSR_SCOUNTEROF),                  \
+               0, RISCV_ISA_EXT_XANDESPMU,                             \
+               CONFIG_ANDES_CUSTOM_PMU)                                \
+       : "=r" (__ovl) :                                                \
+       : "memory")
+
+#define ALT_SBI_PMU_OVF_CLEAR_PENDING(__irq_mask)                      \
+asm volatile(ALTERNATIVE(                                              \
+       "csrc " __stringify(CSR_IP) ", %0\n\t",                         \
+       "csrc " __stringify(ANDES_CSR_SLIP) ", %0\n\t",                 \
+               0, RISCV_ISA_EXT_XANDESPMU,                             \
+               CONFIG_ANDES_CUSTOM_PMU)                                \
+       : : "r"(__irq_mask)                                             \
+       : "memory")
+
 #define SYSCTL_NO_USER_ACCESS  0
 #define SYSCTL_USER_ACCESS     1
 #define SYSCTL_LEGACY          2
@@ -61,6 +83,7 @@ static int sysctl_perf_user_access __read_mostly = SYSCTL_USER_ACCESS;
 static union sbi_pmu_ctr_info *pmu_ctr_list;
 static bool riscv_pmu_use_irq;
 static unsigned int riscv_pmu_irq_num;
+static unsigned int riscv_pmu_irq_mask;
 static unsigned int riscv_pmu_irq;
 
 /* Cache the available counters in a bitmask */
@@ -694,7 +717,7 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
 
        event = cpu_hw_evt->events[fidx];
        if (!event) {
-               csr_clear(CSR_SIP, BIT(riscv_pmu_irq_num));
+               ALT_SBI_PMU_OVF_CLEAR_PENDING(riscv_pmu_irq_mask);
                return IRQ_NONE;
        }
 
@@ -708,7 +731,7 @@ static irqreturn_t pmu_sbi_ovf_handler(int irq, void *dev)
         * Overflow interrupt pending bit should only be cleared after stopping
         * all the counters to avoid any race condition.
         */
-       csr_clear(CSR_SIP, BIT(riscv_pmu_irq_num));
+       ALT_SBI_PMU_OVF_CLEAR_PENDING(riscv_pmu_irq_mask);
 
        /* No overflow bit is set */
        if (!overflow)
@@ -780,8 +803,7 @@ static int pmu_sbi_starting_cpu(unsigned int cpu, struct hlist_node *node)
 
        if (riscv_pmu_use_irq) {
                cpu_hw_evt->irq = riscv_pmu_irq;
-               csr_clear(CSR_IP, BIT(riscv_pmu_irq_num));
-               csr_set(CSR_IE, BIT(riscv_pmu_irq_num));
+               ALT_SBI_PMU_OVF_CLEAR_PENDING(riscv_pmu_irq_mask);
                enable_percpu_irq(riscv_pmu_irq, IRQ_TYPE_NONE);
        }
 
@@ -792,7 +814,6 @@ static int pmu_sbi_dying_cpu(unsigned int cpu, struct hlist_node *node)
 {
        if (riscv_pmu_use_irq) {
                disable_percpu_irq(riscv_pmu_irq);
-               csr_clear(CSR_IE, BIT(riscv_pmu_irq_num));
        }
 
        /* Disable all counters access for user mode now */
@@ -816,8 +837,14 @@ static int pmu_sbi_setup_irqs(struct riscv_pmu *pmu, struct platform_device *pde
                   riscv_cached_mimpid(0) == 0) {
                riscv_pmu_irq_num = THEAD_C9XX_RV_IRQ_PMU;
                riscv_pmu_use_irq = true;
+       } else if (riscv_isa_extension_available(NULL, XANDESPMU) &&
+                  IS_ENABLED(CONFIG_ANDES_CUSTOM_PMU)) {
+               riscv_pmu_irq_num = ANDES_SLI_CAUSE_BASE + ANDES_RV_IRQ_PMOVI;
+               riscv_pmu_use_irq = true;
        }
 
+       riscv_pmu_irq_mask = BIT(riscv_pmu_irq_num % BITS_PER_LONG);
+
        if (!riscv_pmu_use_irq)
                return -EOPNOTSUPP;
 
index 39e56e1c72032413a082d6f336e01ba5d3d8be92..446fea6dda78b97953d363945a1599712e371b39 100644 (file)
@@ -5,12 +5,12 @@
 #include <asm/types.h>
 
 /**
- * __ffs - find first bit in word.
+ * generic___ffs - find first bit in word.
  * @word: The word to search
  *
  * Undefined if no bit exists, so code should check against 0 first.
  */
-static __always_inline unsigned long __ffs(unsigned long word)
+static __always_inline unsigned long generic___ffs(unsigned long word)
 {
        int num = 0;
 
@@ -41,4 +41,8 @@ static __always_inline unsigned long __ffs(unsigned long word)
        return num;
 }
 
+#ifndef __HAVE_ARCH___FFS
+#define __ffs(word) generic___ffs(word)
+#endif
+
 #endif /* _ASM_GENERIC_BITOPS___FFS_H_ */
index 03f721a8a2b1993734e458399a68848637112c87..54ccccf96e21eaad9cf4a681d304ad8ac689f371 100644 (file)
@@ -5,12 +5,12 @@
 #include <asm/types.h>
 
 /**
- * __fls - find last (most-significant) set bit in a long word
+ * generic___fls - find last (most-significant) set bit in a long word
  * @word: the word to search
  *
  * Undefined if no set bit exists, so code should check against 0 first.
  */
-static __always_inline unsigned long __fls(unsigned long word)
+static __always_inline unsigned long generic___fls(unsigned long word)
 {
        int num = BITS_PER_LONG - 1;
 
@@ -41,4 +41,8 @@ static __always_inline unsigned long __fls(unsigned long word)
        return num;
 }
 
+#ifndef __HAVE_ARCH___FLS
+#define __fls(word) generic___fls(word)
+#endif
+
 #endif /* _ASM_GENERIC_BITOPS___FLS_H_ */
index 323fd5d6ae263ab329ab67f6aa678ec324a3885d..4c43f242daeb172f3bb42dd085d9c36c7f616587 100644 (file)
@@ -3,14 +3,14 @@
 #define _ASM_GENERIC_BITOPS_FFS_H_
 
 /**
- * ffs - find first bit set
+ * generic_ffs - find first bit set
  * @x: the word to search
  *
  * This is defined the same way as
  * the libc and compiler builtin ffs routines, therefore
  * differs in spirit from ffz (man ffs).
  */
-static inline int ffs(int x)
+static inline int generic_ffs(int x)
 {
        int r = 1;
 
@@ -39,4 +39,8 @@ static inline int ffs(int x)
        return r;
 }
 
+#ifndef __HAVE_ARCH_FFS
+#define ffs(x) generic_ffs(x)
+#endif
+
 #endif /* _ASM_GENERIC_BITOPS_FFS_H_ */
index b168bb10e1be17bb6394e749c238da3940ea3a01..26f3ce1dd6e44872000d7066d4b3e37325e915fa 100644 (file)
@@ -3,14 +3,14 @@
 #define _ASM_GENERIC_BITOPS_FLS_H_
 
 /**
- * fls - find last (most-significant) bit set
+ * generic_fls - find last (most-significant) bit set
  * @x: the word to search
  *
  * This is defined the same way as ffs.
  * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
  */
 
-static __always_inline int fls(unsigned int x)
+static __always_inline int generic_fls(unsigned int x)
 {
        int r = 32;
 
@@ -39,4 +39,8 @@ static __always_inline int fls(unsigned int x)
        return r;
 }
 
+#ifndef __HAVE_ARCH_FLS
+#define fls(x) generic_fls(x)
+#endif
+
 #endif /* _ASM_GENERIC_BITOPS_FLS_H_ */
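
The same #ifndef override pattern is applied to __ffs, __fls, ffs and fls. A userspace sketch of the fls case; the generic_fls() body is the usual asm-generic implementation (reproduced from the unchanged middle of the file, which the hunk above does not show), and the __builtin_clz-based fls() merely stands in for an architecture-provided version:

#include <stdio.h>

static inline int generic_fls(unsigned int x)
{
        int r = 32;

        if (!x)
                return 0;
        if (!(x & 0xffff0000u)) { x <<= 16; r -= 16; }
        if (!(x & 0xff000000u)) { x <<= 8;  r -= 8;  }
        if (!(x & 0xf0000000u)) { x <<= 4;  r -= 4;  }
        if (!(x & 0xc0000000u)) { x <<= 2;  r -= 2;  }
        if (!(x & 0x80000000u)) { x <<= 1;  r -= 1;  }
        return r;
}

/* Pretend this build has an architecture-optimized fls(). */
#define __HAVE_ARCH_FLS
static inline int fls(unsigned int x)
{
        return x ? 32 - __builtin_clz(x) : 0;
}

/* The fallback wiring added by the patch above. */
#ifndef __HAVE_ARCH_FLS
#define fls(x) generic_fls(x)
#endif

int main(void)
{
        unsigned int v[] = { 0, 1, 0x00f00000u, 0x80000000u };

        for (int i = 0; i < 4; i++)
                printf("fls(%#x) = %d, generic_fls(%#x) = %d\n",
                       v[i], fls(v[i]), v[i], generic_fls(v[i]));
        return 0;
}
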
index f5a97dec51694894a979dd1d045a6b982622e09c..2488c0c5a2881ead26ab68e8ab32d66c41726b91 100644 (file)
@@ -86,7 +86,7 @@ extern int sysctl_legacy_va_layout;
 
 #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
 extern const int mmap_rnd_bits_min;
-extern const int mmap_rnd_bits_max;
+extern int mmap_rnd_bits_max __ro_after_init;
 extern int mmap_rnd_bits __read_mostly;
 #endif
 #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
diff --git a/include/linux/soc/andes/irq.h b/include/linux/soc/andes/irq.h
new file mode 100644 (file)
index 0000000..edc3182
--- /dev/null
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 Andes Technology Corporation
+ */
+#ifndef __ANDES_IRQ_H
+#define __ANDES_IRQ_H
+
+/* Andes PMU irq number */
+#define ANDES_RV_IRQ_PMOVI             18
+#define ANDES_RV_IRQ_LAST              ANDES_RV_IRQ_PMOVI
+#define ANDES_SLI_CAUSE_BASE           256
+
+/* Andes PMU related registers */
+#define ANDES_CSR_SLIE                 0x9c4
+#define ANDES_CSR_SLIP                 0x9c5
+#define ANDES_CSR_SCOUNTEROF           0x9d4
+
+#endif /* __ANDES_IRQ_H */
index 013da4b8b3272c6aff56a84cc40ce8d8a5d17041..67bb9794b87585c5d4584205879fbcb469fa0469 100644 (file)
@@ -17,5 +17,19 @@ static inline void sync_core_before_usermode(void)
 }
 #endif
 
-#endif /* _LINUX_SYNC_CORE_H */
+#ifdef CONFIG_ARCH_HAS_PREPARE_SYNC_CORE_CMD
+#include <asm/sync_core.h>
+#else
+/*
+ * This is a dummy prepare_sync_core_cmd() implementation that can be used on
+ * all architectures which provide unconditional core serializing instructions
+ * in switch_mm().
+ * If your architecture doesn't provide such core serializing instructions in
+ * switch_mm(), you may need to write your own functions.
+ */
+static inline void prepare_sync_core_cmd(struct mm_struct *mm)
+{
+}
+#endif
 
+#endif /* _LINUX_SYNC_CORE_H */
index 8df18f3a974846b48e41b2a8dcbc2f2f2f90128e..c3994b92333ded7989a672cf0e594896debfad67 100644 (file)
@@ -1970,6 +1970,9 @@ source "kernel/Kconfig.locks"
 config ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
        bool
 
+config ARCH_HAS_PREPARE_SYNC_CORE_CMD
+       bool
+
 config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
        bool
 
index 9116bcc903467fe0d5854e3deb06b8d334cf85eb..e4a87bcf28d405407dc3cdbbd2ae3459eb11c30c 100644 (file)
@@ -6638,7 +6638,9 @@ static void __sched notrace __schedule(unsigned int sched_mode)
         *     if (signal_pending_state())          if (p->state & @state)
         *
         * Also, the membarrier system call requires a full memory barrier
-        * after coming from user-space, before storing to rq->curr.
+        * after coming from user-space, before storing to rq->curr; this
+        * barrier matches a full barrier in the proximity of the membarrier
+        * system call exit.
         */
        rq_lock(rq, &rf);
        smp_mb__after_spinlock();
@@ -6709,12 +6711,20 @@ static void __sched notrace __schedule(unsigned int sched_mode)
                 *
                 * Here are the schemes providing that barrier on the
                 * various architectures:
-                * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
-                *   switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
+                * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC,
+                *   RISC-V.  switch_mm() relies on membarrier_arch_switch_mm()
+                *   on PowerPC and on RISC-V.
                 * - finish_lock_switch() for weakly-ordered
                 *   architectures where spin_unlock is a full barrier,
                 * - switch_to() for arm64 (weakly-ordered, spin_unlock
                 *   is a RELEASE barrier),
+                *
+                * The barrier matches a full barrier in the proximity of
+                * the membarrier system call entry.
+                *
+                * On RISC-V, this barrier pairing is also needed for the
+                * SYNC_CORE command when switching between processes, cf.
+                * the inline comments in membarrier_arch_switch_mm().
                 */
                ++*switch_count;
 
index 2ad881d07752c15f60a4c14bee21051117d5aeb2..703e8d80a576d171c410d14117000dd1fc47d2b0 100644 (file)
@@ -251,7 +251,7 @@ static int membarrier_global_expedited(void)
                return 0;
 
        /*
-        * Matches memory barriers around rq->curr modification in
+        * Matches memory barriers after rq->curr modification in
         * scheduler.
         */
        smp_mb();       /* system call entry is not a mb. */
@@ -300,7 +300,7 @@ static int membarrier_global_expedited(void)
 
        /*
         * Memory barrier on the caller thread _after_ we finished
-        * waiting for the last IPI. Matches memory barriers around
+        * waiting for the last IPI. Matches memory barriers before
         * rq->curr modification in scheduler.
         */
        smp_mb();       /* exit from system call is not a mb */
@@ -320,6 +320,7 @@ static int membarrier_private_expedited(int flags, int cpu_id)
                      MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
                        return -EPERM;
                ipi_func = ipi_sync_core;
+               prepare_sync_core_cmd(mm);
        } else if (flags == MEMBARRIER_FLAG_RSEQ) {
                if (!IS_ENABLED(CONFIG_RSEQ))
                        return -EINVAL;
@@ -339,8 +340,12 @@ static int membarrier_private_expedited(int flags, int cpu_id)
                return 0;
 
        /*
-        * Matches memory barriers around rq->curr modification in
+        * Matches memory barriers after rq->curr modification in
         * scheduler.
+        *
+        * On RISC-V, this barrier pairing is also needed for the
+        * SYNC_CORE command when switching between processes, cf.
+        * the inline comments in membarrier_arch_switch_mm().
         */
        smp_mb();       /* system call entry is not a mb. */
 
@@ -415,7 +420,7 @@ out:
 
        /*
         * Memory barrier on the caller thread _after_ we finished
-        * waiting for the last IPI. Matches memory barriers around
+        * waiting for the last IPI. Matches memory barriers before
         * rq->curr modification in scheduler.
         */
        smp_mb();       /* exit from system call is not a mb */
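
For context, the SYNC_CORE flavour that prepare_sync_core_cmd() now hooks into is driven from userspace roughly as below; membarrier(2) has no glibc wrapper, so the raw syscall is used, and error handling is trimmed to a minimum:

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/membarrier.h>

static long membarrier(int cmd, unsigned int flags, int cpu_id)
{
        return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

int main(void)
{
        /* Opt the process in once... */
        if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, 0, -1)) {
                perror("register SYNC_CORE");
                return 1;
        }

        /*
         * ...then, e.g. after patching code in this process, force every
         * running thread through a core-serializing instruction before
         * executing the new instructions.
         */
        if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0, -1)) {
                perror("SYNC_CORE");
                return 1;
        }

        puts("expedited SYNC_CORE membarrier completed");
        return 0;
}
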
index b78e83d351d2864a6a339059ac734b6602eb5824..8f47011de22e49b35258cc6f1bf3eaa12170c737 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -64,7 +64,7 @@
 
 #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
 const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
-const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
+int mmap_rnd_bits_max __ro_after_init = CONFIG_ARCH_MMAP_RND_BITS_MAX;
 int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
 #endif
 #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
index 5a84b6443875c47013348bfed85ebefe8c6da4db..3ee8ecfb8c044c3bf65461e81af5a9e95391fa44 100644 (file)
@@ -33,7 +33,7 @@ ld-option = $(success,$(LD) -v $(1))
 
 # $(as-instr,<instr>)
 # Return y if the assembler supports <instr>, n otherwise
-as-instr = $(success,printf "%b\n" "$(1)" | $(CC) $(CLANG_FLAGS) -c -x assembler-with-cpp -o /dev/null -)
+as-instr = $(success,printf "%b\n" "$(1)" | $(CC) $(CLANG_FLAGS) -Wa$(comma)--fatal-warnings -c -x assembler-with-cpp -o /dev/null -)
 
 # check if $(CC) and $(LD) exist
 $(error-if,$(failure,command -v $(CC)),C compiler '$(CC)' not found)
index 8fcb427405a6f17f61655a6d0881c433f22e1dd6..92be0c9a13eeb51beca06abe15bfe22c6e72bfcb 100644 (file)
@@ -38,7 +38,7 @@ as-option = $(call try-run,\
 # Usage: aflags-y += $(call as-instr,instr,option1,option2)
 
 as-instr = $(call try-run,\
-       printf "%b\n" "$(1)" | $(CC) -Werror $(CLANG_FLAGS) $(KBUILD_AFLAGS) -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3))
+       printf "%b\n" "$(1)" | $(CC) -Werror $(CLANG_FLAGS) $(KBUILD_AFLAGS) -Wa$(comma)--fatal-warnings -c -x assembler-with-cpp -o "$$TMP" -,$(2),$(3))
 
 # __cc-option
 # Usage: MY_CFLAGS += $(call __cc-option,$(CC),$(MY_CFLAGS),-march=winchip-c6,-march=i586)
diff --git a/tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json b/tools/perf/pmu-events/arch/riscv/andes/ax45/firmware.json
new file mode 100644 (file)
index 0000000..9b4a032
--- /dev/null
@@ -0,0 +1,68 @@
+[
+  {
+    "ArchStdEvent": "FW_MISALIGNED_LOAD"
+  },
+  {
+    "ArchStdEvent": "FW_MISALIGNED_STORE"
+  },
+  {
+    "ArchStdEvent": "FW_ACCESS_LOAD"
+  },
+  {
+    "ArchStdEvent": "FW_ACCESS_STORE"
+  },
+  {
+    "ArchStdEvent": "FW_ILLEGAL_INSN"
+  },
+  {
+    "ArchStdEvent": "FW_SET_TIMER"
+  },
+  {
+    "ArchStdEvent": "FW_IPI_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_IPI_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_FENCE_I_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_FENCE_I_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_SFENCE_VMA_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_SFENCE_VMA_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_SFENCE_VMA_ASID_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_GVMA_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_GVMA_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_GVMA_VMID_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_GVMA_VMID_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_VVMA_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_VVMA_RECEIVED"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_VVMA_ASID_SENT"
+  },
+  {
+    "ArchStdEvent": "FW_HFENCE_VVMA_ASID_RECEIVED"
+  }
+]
diff --git a/tools/perf/pmu-events/arch/riscv/andes/ax45/instructions.json b/tools/perf/pmu-events/arch/riscv/andes/ax45/instructions.json
new file mode 100644 (file)
index 0000000..713a08c
--- /dev/null
@@ -0,0 +1,127 @@
+[
+       {
+               "EventCode": "0x10",
+               "EventName": "cycle_count",
+               "BriefDescription": "Cycle count"
+       },
+       {
+               "EventCode": "0x20",
+               "EventName": "inst_count",
+               "BriefDescription": "Retired instruction count"
+       },
+       {
+               "EventCode": "0x30",
+               "EventName": "int_load_inst",
+               "BriefDescription": "Integer load instruction count"
+       },
+       {
+               "EventCode": "0x40",
+               "EventName": "int_store_inst",
+               "BriefDescription": "Integer store instruction count"
+       },
+       {
+               "EventCode": "0x50",
+               "EventName": "atomic_inst",
+               "BriefDescription": "Atomic instruction count"
+       },
+       {
+               "EventCode": "0x60",
+               "EventName": "sys_inst",
+               "BriefDescription": "System instruction count"
+       },
+       {
+               "EventCode": "0x70",
+               "EventName": "int_compute_inst",
+               "BriefDescription": "Integer computational instruction count"
+       },
+       {
+               "EventCode": "0x80",
+               "EventName": "condition_br",
+               "BriefDescription": "Conditional branch instruction count"
+       },
+       {
+               "EventCode": "0x90",
+               "EventName": "taken_condition_br",
+               "BriefDescription": "Taken conditional branch instruction count"
+       },
+       {
+               "EventCode": "0xA0",
+               "EventName": "jal_inst",
+               "BriefDescription": "JAL instruction count"
+       },
+       {
+               "EventCode": "0xB0",
+               "EventName": "jalr_inst",
+               "BriefDescription": "JALR instruction count"
+       },
+       {
+               "EventCode": "0xC0",
+               "EventName": "ret_inst",
+               "BriefDescription": "Return instruction count"
+       },
+       {
+               "EventCode": "0xD0",
+               "EventName": "control_trans_inst",
+               "BriefDescription": "Control transfer instruction count"
+       },
+       {
+               "EventCode": "0xE0",
+               "EventName": "ex9_inst",
+               "BriefDescription": "EXEC.IT instruction count"
+       },
+       {
+               "EventCode": "0xF0",
+               "EventName": "int_mul_inst",
+               "BriefDescription": "Integer multiplication instruction count"
+       },
+       {
+               "EventCode": "0x100",
+               "EventName": "int_div_rem_inst",
+               "BriefDescription": "Integer division/remainder instruction count"
+       },
+       {
+               "EventCode": "0x110",
+               "EventName": "float_load_inst",
+               "BriefDescription": "Floating-point load instruction count"
+       },
+       {
+               "EventCode": "0x120",
+               "EventName": "float_store_inst",
+               "BriefDescription": "Floating-point store instruction count"
+       },
+       {
+               "EventCode": "0x130",
+               "EventName": "float_add_sub_inst",
+               "BriefDescription": "Floating-point addition/subtraction instruction count"
+       },
+       {
+               "EventCode": "0x140",
+               "EventName": "float_mul_inst",
+               "BriefDescription": "Floating-point multiplication instruction count"
+       },
+       {
+               "EventCode": "0x150",
+               "EventName": "float_fused_muladd_inst",
+               "BriefDescription": "Floating-point fused multiply-add instruction count"
+       },
+       {
+               "EventCode": "0x160",
+               "EventName": "float_div_sqrt_inst",
+               "BriefDescription": "Floating-point division or square-root instruction count"
+       },
+       {
+               "EventCode": "0x170",
+               "EventName": "other_float_inst",
+               "BriefDescription": "Other floating-point instruction count"
+       },
+       {
+               "EventCode": "0x180",
+               "EventName": "int_mul_add_sub_inst",
+               "BriefDescription": "Integer multiplication and add/sub instruction count"
+       },
+       {
+               "EventCode": "0x190",
+               "EventName": "retired_ops",
+               "BriefDescription": "Retired operation count"
+       }
+]
diff --git a/tools/perf/pmu-events/arch/riscv/andes/ax45/memory.json b/tools/perf/pmu-events/arch/riscv/andes/ax45/memory.json
new file mode 100644 (file)
index 0000000..c7401b5
--- /dev/null
@@ -0,0 +1,57 @@
+[
+       {
+               "EventCode": "0x01",
+               "EventName": "ilm_access",
+               "BriefDescription": "ILM access"
+       },
+       {
+               "EventCode": "0x11",
+               "EventName": "dlm_access",
+               "BriefDescription": "DLM access"
+       },
+       {
+               "EventCode": "0x21",
+               "EventName": "icache_access",
+               "BriefDescription": "ICACHE access"
+       },
+       {
+               "EventCode": "0x31",
+               "EventName": "icache_miss",
+               "BriefDescription": "ICACHE miss"
+       },
+       {
+               "EventCode": "0x41",
+               "EventName": "dcache_access",
+               "BriefDescription": "DCACHE access"
+       },
+       {
+               "EventCode": "0x51",
+               "EventName": "dcache_miss",
+               "BriefDescription": "DCACHE miss"
+       },
+       {
+               "EventCode": "0x61",
+               "EventName": "dcache_load_access",
+               "BriefDescription": "DCACHE load access"
+       },
+       {
+               "EventCode": "0x71",
+               "EventName": "dcache_load_miss",
+               "BriefDescription": "DCACHE load miss"
+       },
+       {
+               "EventCode": "0x81",
+               "EventName": "dcache_store_access",
+               "BriefDescription": "DCACHE store access"
+       },
+       {
+               "EventCode": "0x91",
+               "EventName": "dcache_store_miss",
+               "BriefDescription": "DCACHE store miss"
+       },
+       {
+               "EventCode": "0xA1",
+               "EventName": "dcache_wb",
+               "BriefDescription": "DCACHE writeback"
+       }
+]
diff --git a/tools/perf/pmu-events/arch/riscv/andes/ax45/microarch.json b/tools/perf/pmu-events/arch/riscv/andes/ax45/microarch.json
new file mode 100644 (file)
index 0000000..a6d378c
--- /dev/null
@@ -0,0 +1,77 @@
+[
+       {
+               "EventCode": "0xB1",
+               "EventName": "cycle_wait_icache_fill",
+               "BriefDescription": "Cycles waiting for ICACHE fill data"
+       },
+       {
+               "EventCode": "0xC1",
+               "EventName": "cycle_wait_dcache_fill",
+               "BriefDescription": "Cycles waiting for DCACHE fill data"
+       },
+       {
+               "EventCode": "0xD1",
+               "EventName": "uncached_ifetch_from_bus",
+               "BriefDescription": "Uncached ifetch data access from bus"
+       },
+       {
+               "EventCode": "0xE1",
+               "EventName": "uncached_load_from_bus",
+               "BriefDescription": "Uncached load data access from bus"
+       },
+       {
+               "EventCode": "0xF1",
+               "EventName": "cycle_wait_uncached_ifetch",
+               "BriefDescription": "Cycles waiting for uncached ifetch data from bus"
+       },
+       {
+               "EventCode": "0x101",
+               "EventName": "cycle_wait_uncached_load",
+               "BriefDescription": "Cycles waiting for uncached load data from bus"
+       },
+       {
+               "EventCode": "0x111",
+               "EventName": "main_itlb_access",
+               "BriefDescription": "Main ITLB access"
+       },
+       {
+               "EventCode": "0x121",
+               "EventName": "main_itlb_miss",
+               "BriefDescription": "Main ITLB miss"
+       },
+       {
+               "EventCode": "0x131",
+               "EventName": "main_dtlb_access",
+               "BriefDescription": "Main DTLB access"
+       },
+       {
+               "EventCode": "0x141",
+               "EventName": "main_dtlb_miss",
+               "BriefDescription": "Main DTLB miss"
+       },
+       {
+               "EventCode": "0x151",
+               "EventName": "cycle_wait_itlb_fill",
+               "BriefDescription": "Cycles waiting for Main ITLB fill data"
+       },
+       {
+               "EventCode": "0x161",
+               "EventName": "pipe_stall_cycle_dtlb_miss",
+               "BriefDescription": "Pipeline stall cycles caused by Main DTLB miss"
+       },
+       {
+               "EventCode": "0x02",
+               "EventName": "mispredict_condition_br",
+               "BriefDescription": "Misprediction of conditional branches"
+       },
+       {
+               "EventCode": "0x12",
+               "EventName": "mispredict_take_condition_br",
+               "BriefDescription": "Misprediction of taken conditional branches"
+       },
+       {
+               "EventCode": "0x22",
+               "EventName": "mispredict_target_ret_inst",
+               "BriefDescription": "Misprediction of targets of Return instructions"
+       }
+]
index cfc449b198105ebe5004c0565de85499ff14f319..3d3a809a5446e8e811dcd6cc243c33c58811e0b1 100644 (file)
@@ -17,3 +17,4 @@
 0x489-0x8000000000000007-0x[[:xdigit:]]+,v1,sifive/u74,core
 0x5b7-0x0-0x0,v1,thead/c900-legacy,core
 0x67e-0x80000000db0000[89]0-0x[[:xdigit:]]+,v1,starfive/dubhe-80,core
+0x31e-0x8000000000008a45-0x[[:xdigit:]]+,v1,andes/ax45,core
index 1757d19ca89b1bc06e6f9391440f502dd74adcbd..7f7d3eb8b9c9267d6e13a9907ade9bf5ed2f36dc 100644 (file)
@@ -6,30 +6,9 @@
 
 TEST(infinite_rlimit)
 {
-// Only works on 64 bit
-#if __riscv_xlen == 64
-       struct addresses mmap_addresses;
-
        EXPECT_EQ(BOTTOM_UP, memory_layout());
 
-       do_mmaps(&mmap_addresses);
-
-       EXPECT_NE(MAP_FAILED, mmap_addresses.no_hint);
-       EXPECT_NE(MAP_FAILED, mmap_addresses.on_37_addr);
-       EXPECT_NE(MAP_FAILED, mmap_addresses.on_38_addr);
-       EXPECT_NE(MAP_FAILED, mmap_addresses.on_46_addr);
-       EXPECT_NE(MAP_FAILED, mmap_addresses.on_47_addr);
-       EXPECT_NE(MAP_FAILED, mmap_addresses.on_55_addr);
-       EXPECT_NE(MAP_FAILED, mmap_addresses.on_56_addr);
-
-       EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.no_hint);
-       EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_37_addr);
-       EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_38_addr);
-       EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_46_addr);
-       EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.on_47_addr);
-       EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.on_55_addr);
-       EXPECT_GT(1UL << 56, (unsigned long)mmap_addresses.on_56_addr);
-#endif
+       TEST_MMAPS;
 }
 
 TEST_HARNESS_MAIN
index c63c60b9397e7ff9d1949df11c229ae7388388bd..2ba3ec9900064daff126ffd3edcf099d37fd2f18 100644 (file)
@@ -6,30 +6,9 @@
 
 TEST(default_rlimit)
 {
-// Only works on 64 bit
-#if __riscv_xlen == 64
-       struct addresses mmap_addresses;
-
        EXPECT_EQ(TOP_DOWN, memory_layout());
 
-       do_mmaps(&mmap_addresses);
-
-       EXPECT_NE(MAP_FAILED, mmap_addresses.no_hint);
-       EXPECT_NE(MAP_FAILED, mmap_addresses.on_37_addr);
-       EXPECT_NE(MAP_FAILED, mmap_addresses.on_38_addr);
-       EXPECT_NE(MAP_FAILED, mmap_addresses.on_46_addr);
-       EXPECT_NE(MAP_FAILED, mmap_addresses.on_47_addr);
-       EXPECT_NE(MAP_FAILED, mmap_addresses.on_55_addr);
-       EXPECT_NE(MAP_FAILED, mmap_addresses.on_56_addr);
-
-       EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.no_hint);
-       EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_37_addr);
-       EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_38_addr);
-       EXPECT_GT(1UL << 38, (unsigned long)mmap_addresses.on_46_addr);
-       EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.on_47_addr);
-       EXPECT_GT(1UL << 47, (unsigned long)mmap_addresses.on_55_addr);
-       EXPECT_GT(1UL << 56, (unsigned long)mmap_addresses.on_56_addr);
-#endif
+       TEST_MMAPS;
 }
 
 TEST_HARNESS_MAIN
index 2e0db9c5be6c334f9ed7d0187fae6ed6de950745..3b29ca3bb3d40d1aa11433d7433ba62ad85e7cbe 100644 (file)
@@ -4,63 +4,86 @@
 #include <sys/mman.h>
 #include <sys/resource.h>
 #include <stddef.h>
+#include <strings.h>
+#include "../../kselftest_harness.h"
 
 #define TOP_DOWN 0
 #define BOTTOM_UP 1
 
-struct addresses {
-       int *no_hint;
-       int *on_37_addr;
-       int *on_38_addr;
-       int *on_46_addr;
-       int *on_47_addr;
-       int *on_55_addr;
-       int *on_56_addr;
+#if __riscv_xlen == 64
+uint64_t random_addresses[] = {
+       0x19764f0d73b3a9f0, 0x016049584cecef59, 0x3580bdd3562f4acd,
+       0x1164219f20b17da0, 0x07d97fcb40ff2373, 0x76ec528921272ee7,
+       0x4dd48c38a3de3f70, 0x2e11415055f6997d, 0x14b43334ac476c02,
+       0x375a60795aff19f6, 0x47f3051725b8ee1a, 0x4e697cf240494a9f,
+       0x456b59b5c2f9e9d1, 0x101724379d63cb96, 0x7fe9ad31619528c1,
+       0x2f417247c495c2ea, 0x329a5a5b82943a5e, 0x06d7a9d6adcd3827,
+       0x327b0b9ee37f62d5, 0x17c7b1851dfd9b76, 0x006ebb6456ec2cd9,
+       0x00836cd14146a134, 0x00e5c4dcde7126db, 0x004c29feadf75753,
+       0x00d8b20149ed930c, 0x00d71574c269387a, 0x0006ebe4a82acb7a,
+       0x0016135df51f471b, 0x00758bdb55455160, 0x00d0bdd949b13b32,
+       0x00ecea01e7c5f54b, 0x00e37b071b9948b1, 0x0011fdd00ff57ab3,
+       0x00e407294b52f5ea, 0x00567748c200ed20, 0x000d073084651046,
+       0x00ac896f4365463c, 0x00eb0d49a0b26216, 0x0066a2564a982a31,
+       0x002e0d20237784ae, 0x0000554ff8a77a76, 0x00006ce07a54c012,
+       0x000009570516d799, 0x00000954ca15b84d, 0x0000684f0d453379,
+       0x00002ae5816302b5, 0x0000042403fb54bf, 0x00004bad7392bf30,
+       0x00003e73bfa4b5e3, 0x00005442c29978e0, 0x00002803f11286b6,
+       0x000073875d745fc6, 0x00007cede9cb8240, 0x000027df84cc6a4f,
+       0x00006d7e0e74242a, 0x00004afd0b836e02, 0x000047d0e837cd82,
+       0x00003b42405efeda, 0x00001531bafa4c95, 0x00007172cae34ac4,
+};
+#else
+uint32_t random_addresses[] = {
+       0x8dc302e0, 0x929ab1e0, 0xb47683ba, 0xea519c73, 0xa19f1c90, 0xc49ba213,
+       0x8f57c625, 0xadfe5137, 0x874d4d95, 0xaa20f09d, 0xcf21ebfc, 0xda7737f1,
+       0xcedf392a, 0x83026c14, 0xccedca52, 0xc6ccf826, 0xe0cd9415, 0x997472ca,
+       0xa21a44c1, 0xe82196f5, 0xa23fd66b, 0xc28d5590, 0xd009cdce, 0xcf0be646,
+       0x8fc8c7ff, 0xe2a85984, 0xa3d3236b, 0x89a0619d, 0xc03db924, 0xb5d4cc1b,
+       0xb96ee04c, 0xd191da48, 0xb432a000, 0xaa2bebbc, 0xa2fcb289, 0xb0cca89b,
+       0xb0c18d6a, 0x88f58deb, 0xa4d42d1c, 0xe4d74e86, 0x99902b09, 0x8f786d31,
+       0xbec5e381, 0x9a727e65, 0xa9a65040, 0xa880d789, 0x8f1b335e, 0xfc821c1e,
+       0x97e34be4, 0xbbef84ed, 0xf447d197, 0xfd7ceee2, 0xe632348d, 0xee4590f4,
+       0x958992a5, 0xd57e05d6, 0xfd240970, 0xc5b0dcff, 0xd96da2c2, 0xa7ae041d,
 };
+#endif
 
 // Only works on 64 bit
 #if __riscv_xlen == 64
-static inline void do_mmaps(struct addresses *mmap_addresses)
-{
-       /*
-        * Place all of the hint addresses on the boundaries of mmap
-        * sv39, sv48, sv57
-        * User addresses end at 1<<38, 1<<47, 1<<56 respectively
-        */
-       void *on_37_bits = (void *)(1UL << 37);
-       void *on_38_bits = (void *)(1UL << 38);
-       void *on_46_bits = (void *)(1UL << 46);
-       void *on_47_bits = (void *)(1UL << 47);
-       void *on_55_bits = (void *)(1UL << 55);
-       void *on_56_bits = (void *)(1UL << 56);
+#define PROT (PROT_READ | PROT_WRITE)
+#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS)
 
-       int prot = PROT_READ | PROT_WRITE;
-       int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+/* mmap must return a value that doesn't use more bits than the hint address. */
+static inline unsigned long get_max_value(unsigned long input)
+{
+       unsigned long max_bit = (1UL << (((sizeof(unsigned long) * 8) - 1 -
+                                         __builtin_clzl(input))));
 
-       mmap_addresses->no_hint =
-               mmap(NULL, 5 * sizeof(int), prot, flags, 0, 0);
-       mmap_addresses->on_37_addr =
-               mmap(on_37_bits, 5 * sizeof(int), prot, flags, 0, 0);
-       mmap_addresses->on_38_addr =
-               mmap(on_38_bits, 5 * sizeof(int), prot, flags, 0, 0);
-       mmap_addresses->on_46_addr =
-               mmap(on_46_bits, 5 * sizeof(int), prot, flags, 0, 0);
-       mmap_addresses->on_47_addr =
-               mmap(on_47_bits, 5 * sizeof(int), prot, flags, 0, 0);
-       mmap_addresses->on_55_addr =
-               mmap(on_55_bits, 5 * sizeof(int), prot, flags, 0, 0);
-       mmap_addresses->on_56_addr =
-               mmap(on_56_bits, 5 * sizeof(int), prot, flags, 0, 0);
+       return max_bit + (max_bit - 1);
 }
+
+#define TEST_MMAPS                                                            \
+       ({                                                                    \
+               void *mmap_addr;                                              \
+               for (int i = 0; i < ARRAY_SIZE(random_addresses); i++) {      \
+                       mmap_addr = mmap((void *)random_addresses[i],         \
+                                        5 * sizeof(int), PROT, FLAGS, 0, 0); \
+                       EXPECT_NE(MAP_FAILED, mmap_addr);                     \
+                       EXPECT_GE((void *)get_max_value(random_addresses[i]), \
+                                 mmap_addr);                                 \
+                       mmap_addr = mmap((void *)random_addresses[i],         \
+                                        5 * sizeof(int), PROT, FLAGS, 0, 0); \
+                       EXPECT_NE(MAP_FAILED, mmap_addr);                     \
+                       EXPECT_GE((void *)get_max_value(random_addresses[i]), \
+                                 mmap_addr);                                 \
+               }                                                             \
+       })
 #endif /* __riscv_xlen == 64 */
 
 static inline int memory_layout(void)
 {
-       int prot = PROT_READ | PROT_WRITE;
-       int flags = MAP_PRIVATE | MAP_ANONYMOUS;
-
-       void *value1 = mmap(NULL, sizeof(int), prot, flags, 0, 0);
-       void *value2 = mmap(NULL, sizeof(int), prot, flags, 0, 0);
+       void *value1 = mmap(NULL, sizeof(int), PROT, FLAGS, 0, 0);
+       void *value2 = mmap(NULL, sizeof(int), PROT, FLAGS, 0, 0);
 
        return value2 > value1;
 }
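
TEST_MMAPS feeds mmap() the random hints above and checks that the returned address never uses more bits than the hint. A standalone sketch of the get_max_value() arithmetic, reusing a few hint values from the tables (64-bit host assumed):

#include <stdio.h>

/* Largest value representable in the same number of bits as the hint. */
static unsigned long get_max_value(unsigned long input)
{
        unsigned long max_bit = 1UL << ((sizeof(unsigned long) * 8) - 1 -
                                        __builtin_clzl(input));

        return max_bit + (max_bit - 1);
}

int main(void)
{
        unsigned long hints[] = {
                0x19764f0d73b3a9f0UL,   /* 61 significant bits */
                0x00006ce07a54c012UL,   /* 47 significant bits */
                0x000000008dc302e0UL,   /* 32 significant bits */
        };

        for (int i = 0; i < 3; i++)
                printf("hint %#018lx -> highest allowed address %#lx\n",
                       hints[i], get_max_value(hints[i]));
        return 0;
}
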