Userfaultfd is supposed to provide the full address (i.e., unmasked) of
the faulting access back to userspace.  However, that has not been the
case for quite some time.
Even running "userfaultfd_demo" from the userfaultfd man page provides the
wrong output (and contradicts the man page).  Notice that
"UFFD_EVENT_PAGEFAULT event" shows the masked address (7fc5e30b3000) and
not the first read address (0x7fc5e30b300f).
	Address returned by mmap() = 0x7fc5e30b3000
	fault_handler_thread():
	    poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
	    UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fc5e30b3000
		(uffdio_copy.copy returned 4096)
	Read address 0x7fc5e30b300f in main(): A
	Read address 0x7fc5e30b340f in main(): A
	Read address 0x7fc5e30b380f in main(): A
	Read address 0x7fc5e30b3c0f in main(): A
The exact address is useful for various reasons and specifically for
prefetching decisions.  If it is known that the memory is populated by
certain objects whose size is not page-aligned, then based on the faulting
address, the uffd-monitor can decide whether to prefetch and prefault the
adjacent page.
This bug has been in the kernel for quite some time: since commit
1a29d85eb0f1 ("mm: use vmf->address instead of of vmf->virtual_address"),
which dates back to 2016.  A concern has been raised that existing
userspace applications might rely on the old/wrong behavior in which the
address is masked.  Therefore, it was suggested to provide the masked
address unless the user explicitly asks for the exact address.
Add a new userfaultfd feature UFFD_FEATURE_EXACT_ADDRESS to direct
userfaultfd to provide the exact address.  Add a new "real_address" field
to vmf to hold the unmasked address.  Provide the address to userspace
accordingly.
Initialize real_address in various code-paths to be consistent with
address, even when it is not used, to be on the safe side.
[namit@vmware.com: initialize real_address on all code paths, per Jan]
Link: https://lkml.kernel.org/r/20220226022655.350562-1-namit@vmware.com
[akpm@linux-foundation.org: fix typo in comment, per Jan]
Link: https://lkml.kernel.org/r/20220218041003.3508-1-namit@vmware.com
Signed-off-by: Nadav Amit <namit@vmware.com>
Acked-by: Peter Xu <peterx@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
        struct uffd_msg msg;
        msg_init(&msg);
        msg.event = UFFD_EVENT_PAGEFAULT;
+
+       if (!(features & UFFD_FEATURE_EXACT_ADDRESS))
+               address &= PAGE_MASK;
        msg.arg.pagefault.address = address;
        /*
         * These flags indicate why the userfault occurred:
 
        init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
        uwq.wq.private = current;
-       uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
+       uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason,
                        ctx->features);
        uwq.ctx = ctx;
        uwq.waken = false;
 
                struct vm_area_struct *vma;     /* Target VMA */
                gfp_t gfp_mask;                 /* gfp mask to be used for allocations */
                pgoff_t pgoff;                  /* Logical page offset based on vma */
-               unsigned long address;          /* Faulting virtual address */
+               unsigned long address;          /* Faulting virtual address - masked */
+               unsigned long real_address;     /* Faulting virtual address - unmasked */
        };
        enum fault_flag flags;          /* FAULT_FLAG_xxx flags
                                         * XXX: should really be 'const' */
 
                           UFFD_FEATURE_SIGBUS |                \
                           UFFD_FEATURE_THREAD_ID |             \
                           UFFD_FEATURE_MINOR_HUGETLBFS |       \
-                          UFFD_FEATURE_MINOR_SHMEM)
+                          UFFD_FEATURE_MINOR_SHMEM |           \
+                          UFFD_FEATURE_EXACT_ADDRESS)
 #define UFFD_API_IOCTLS                                \
        ((__u64)1 << _UFFDIO_REGISTER |         \
         (__u64)1 << _UFFDIO_UNREGISTER |       \
         *
         * UFFD_FEATURE_MINOR_SHMEM indicates the same support as
         * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead.
+        *
+        * UFFD_FEATURE_EXACT_ADDRESS indicates that the exact address of page
+        * faults would be provided and the offset within the page would not be
+        * masked.
         */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP         (1<<0)
 #define UFFD_FEATURE_EVENT_FORK                        (1<<1)
 #define UFFD_FEATURE_THREAD_ID                 (1<<8)
 #define UFFD_FEATURE_MINOR_HUGETLBFS           (1<<9)
 #define UFFD_FEATURE_MINOR_SHMEM               (1<<10)
+#define UFFD_FEATURE_EXACT_ADDRESS             (1<<11)
        __u64 features;
 
        __u64 ioctls;
 
                                                  pgoff_t idx,
                                                  unsigned int flags,
                                                  unsigned long haddr,
+                                                 unsigned long addr,
                                                  unsigned long reason)
 {
        vm_fault_t ret;
        struct vm_fault vmf = {
                .vma = vma,
                .address = haddr,
+               .real_address = addr,
                .flags = flags,
 
                /*
                /* Check for page in userfault range */
                if (userfaultfd_missing(vma)) {
                        ret = hugetlb_handle_userfault(vma, mapping, idx,
-                                                      flags, haddr,
+                                                      flags, haddr, address,
                                                       VM_UFFD_MISSING);
                        goto out;
                }
                        unlock_page(page);
                        put_page(page);
                        ret = hugetlb_handle_userfault(vma, mapping, idx,
-                                                      flags, haddr,
+                                                      flags, haddr, address,
                                                       VM_UFFD_MINOR);
                        goto out;
                }
 
        struct vm_fault vmf = {
                .vma = vma,
                .address = address & PAGE_MASK,
+               .real_address = address,
                .flags = flags,
                .pgoff = linear_page_index(vma, address),
                .gfp_mask = __get_fault_gfp_mask(vma),
 
                        struct vm_fault vmf = {
                                .vma = vma,
                                .address = addr,
+                               .real_address = addr,
                                .pmd = pmd,
                        };