progress = 0;
        init_rss_vec(rss);
 
+       /*
+        * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the
+        * error handling here, assume that exclusive mmap_lock on dst and src
+        * protects anon from unexpected THP transitions; with shmem and file
+        * protected by mmap_lock-less collapse skipping areas with anon_vma
+        * (whereas vma_needs_copy() skips areas without anon_vma).  A rework
+        * can remove such assumptions later, but this is good enough for now.
+        */
        dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
        if (!dst_pte) {
                ret = -ENOMEM;
                goto out;
        }
-       src_pte = pte_offset_map(src_pmd, addr);
-       src_ptl = pte_lockptr(src_mm, src_pmd);
+       src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
+       if (!src_pte) {
+               pte_unmap_unlock(dst_pte, dst_ptl);
+               /* ret == 0 */
+               goto out;
+       }
        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
        orig_src_pte = src_pte;
        orig_dst_pte = dst_pte;
        } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 
        arch_leave_lazy_mmu_mode();
-       spin_unlock(src_ptl);
-       pte_unmap(orig_src_pte);
+       pte_unmap_unlock(orig_src_pte, src_ptl);
        add_mm_rss_vec(dst_mm, rss);
        pte_unmap_unlock(orig_dst_pte, dst_ptl);
        cond_resched();
        swp_entry_t entry;
 
        tlb_change_page_size(tlb, PAGE_SIZE);
-again:
        init_rss_vec(rss);
-       start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
-       pte = start_pte;
+       start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+       if (!pte)
+               return addr;
+
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        do {
         * If we forced a TLB flush (either due to running out of
         * batch buffers or because we needed to flush dirty TLB
         * entries before releasing the ptl), free the batched
-        * memory too. Restart if we didn't do everything.
+        * memory too. Come back again if we didn't do everything.
         */
-       if (force_flush) {
-               force_flush = 0;
+       if (force_flush)
                tlb_flush_mmu(tlb);
-       }
-
-       if (addr != end) {
-               cond_resched();
-               goto again;
-       }
 
        return addr;
 }
                if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
                        if (next - addr != HPAGE_PMD_SIZE)
                                __split_huge_pmd(vma, pmd, addr, false, NULL);
-                       else if (zap_huge_pmd(tlb, vma, pmd, addr))
-                               goto next;
+                       else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
+                               addr = next;
+                               continue;
+                       }
                        /* fall through */
                } else if (details && details->single_folio &&
                           folio_test_pmd_mappable(details->single_folio) &&
                         */
                        spin_unlock(ptl);
                }
-
-               /*
-                * Here there can be other concurrent MADV_DONTNEED or
-                * trans huge page faults running, and if the pmd is
-                * none or trans huge it can change under us. This is
-                * because MADV_DONTNEED holds the mmap_lock in read
-                * mode.
-                */
-               if (pmd_none_or_trans_huge_or_clear_bad(pmd))
-                       goto next;
-               next = zap_pte_range(tlb, vma, pmd, addr, next, details);
-next:
-               cond_resched();
-       } while (pmd++, addr = next, addr != end);
+               if (pmd_none(*pmd)) {
+                       addr = next;
+                       continue;
+               }
+               addr = zap_pte_range(tlb, vma, pmd, addr, next, details);
+               if (addr != next)
+                       pmd--;
+       } while (pmd++, cond_resched(), addr != end);
 
        return addr;
 }
                const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
 
                start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
+               if (!start_pte) {
+                       ret = -EFAULT;
+                       goto out;
+               }
                for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
                        int err = insert_page_in_batch_locked(vma, pte,
                                addr, pages[curr_page_idx], prot);
                mapped_pte = pte = (mm == &init_mm) ?
                        pte_offset_kernel(pmd, addr) :
                        pte_offset_map_lock(mm, pmd, addr, &ptl);
+               if (!pte)
+                       return -EINVAL;
        }
 
-       BUG_ON(pmd_huge(*pmd));
-
        arch_enter_lazy_mmu_mode();
 
        if (fn) {
        int ret;
        void *kaddr;
        void __user *uaddr;
-       bool locked = false;
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long addr = vmf->address;
         * On architectures with software "accessed" bits, we would
         * take a double page fault, so mark it accessed here.
         */
+       vmf->pte = NULL;
        if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
                pte_t entry;
 
                vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
-               locked = true;
-               if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+               if (unlikely(!vmf->pte || !pte_same(*vmf->pte, vmf->orig_pte))) {
                        /*
                         * Other thread has already handled the fault
                         * and update local tlb only
         * zeroes.
         */
        if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
-               if (locked)
+               if (vmf->pte)
                        goto warn;
 
                /* Re-validate under PTL if the page is still mapped */
                vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
-               locked = true;
-               if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+               if (unlikely(!vmf->pte || !pte_same(*vmf->pte, vmf->orig_pte))) {
                        /* The PTE changed under us, update local tlb */
                        update_mmu_tlb(vma, addr, vmf->pte);
                        ret = -EAGAIN;
        ret = 0;
 
 pte_unlock:
-       if (locked)
+       if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        kunmap_atomic(kaddr);
        flush_dcache_page(dst);
         * Re-check the pte - we dropped the lock
         */
        vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
-       if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+       if (likely(vmf->pte && pte_same(*vmf->pte, vmf->orig_pte))) {
                if (old_folio) {
                        if (!folio_test_anon(old_folio)) {
                                dec_mm_counter(mm, mm_counter_file(&old_folio->page));
                /* Free the old page.. */
                new_folio = old_folio;
                page_copied = 1;
-       } else {
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
+       } else if (vmf->pte) {
                update_mmu_tlb(vma, vmf->address, vmf->pte);
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
        }
 
-       if (new_folio)
-               folio_put(new_folio);
-
-       pte_unmap_unlock(vmf->pte, vmf->ptl);
        /*
         * No need to double call mmu_notifier->invalidate_range() callback as
         * the above ptep_clear_flush_notify() did already call it.
         */
        mmu_notifier_invalidate_range_only_end(&range);
+
+       if (new_folio)
+               folio_put(new_folio);
        if (old_folio) {
                if (page_copied)
                        free_swap_cache(&old_folio->page);
        WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
        vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
                                       &vmf->ptl);
+       if (!vmf->pte)
+               return VM_FAULT_NOPAGE;
        /*
         * We might have raced with another page fault while we released the
         * pte_offset_map_lock.
 
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                                &vmf->ptl);
-       if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+       if (likely(vmf->pte && pte_same(*vmf->pte, vmf->orig_pte)))
                restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);
 
-       pte_unmap_unlock(vmf->pte, vmf->ptl);
+       if (vmf->pte)
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
        folio_unlock(folio);
        folio_put(folio);
 
 {
        vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
                                       vmf->address, &vmf->ptl);
+       if (!vmf->pte)
+               return 0;
        /*
         * Be careful so that we will only recover a special uffd-wp pte into a
         * none pte.  Otherwise it means the pte could have changed, so retry.
                        vmf->page = pfn_swap_entry_to_page(entry);
                        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                        vmf->address, &vmf->ptl);
-                       if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
+                       if (unlikely(!vmf->pte ||
+                                    !pte_same(*vmf->pte, vmf->orig_pte)))
                                goto unlock;
 
                        /*
                         */
                        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                        vmf->address, &vmf->ptl);
-                       if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+                       if (likely(vmf->pte && pte_same(*vmf->pte, vmf->orig_pte)))
                                ret = VM_FAULT_OOM;
                        goto unlock;
                }
         */
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                        &vmf->ptl);
-       if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
+       if (unlikely(!vmf->pte || !pte_same(*vmf->pte, vmf->orig_pte)))
                goto out_nomap;
 
        if (unlikely(!folio_test_uptodate(folio))) {
        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, vmf->address, vmf->pte);
 unlock:
-       pte_unmap_unlock(vmf->pte, vmf->ptl);
+       if (vmf->pte)
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
        if (si)
                put_swap_device(si);
        return ret;
 out_nomap:
-       pte_unmap_unlock(vmf->pte, vmf->ptl);
+       if (vmf->pte)
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
 out_page:
        folio_unlock(folio);
 out_release:
                return VM_FAULT_SIGBUS;
 
        /*
-        * Use pte_alloc() instead of pte_alloc_map().  We can't run
-        * pte_offset_map() on pmds where a huge pmd might be created
-        * from a different thread.
-        *
-        * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
-        * parallel threads are excluded by other means.
-        *
-        * Here we only have mmap_read_lock(mm).
+        * Use pte_alloc() instead of pte_alloc_map(), so that OOM can
+        * be distinguished from a transient failure of pte_offset_map().
         */
        if (pte_alloc(vma->vm_mm, vmf->pmd))
                return VM_FAULT_OOM;
 
-       /* See comment in handle_pte_fault() */
-       if (unlikely(pmd_trans_unstable(vmf->pmd)))
-               return 0;
-
        /* Use the zero-page for reads */
        if (!(vmf->flags & FAULT_FLAG_WRITE) &&
                        !mm_forbids_zeropage(vma->vm_mm)) {
                                                vma->vm_page_prot));
                vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                vmf->address, &vmf->ptl);
+               if (!vmf->pte)
+                       goto unlock;
                if (vmf_pte_changed(vmf)) {
                        update_mmu_tlb(vma, vmf->address, vmf->pte);
                        goto unlock;
 
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                        &vmf->ptl);
+       if (!vmf->pte)
+               goto release;
        if (vmf_pte_changed(vmf)) {
                update_mmu_tlb(vma, vmf->address, vmf->pte);
                goto release;
        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, vmf->address, vmf->pte);
 unlock:
-       pte_unmap_unlock(vmf->pte, vmf->ptl);
+       if (vmf->pte)
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
        return ret;
 release:
        folio_put(folio);
                        return VM_FAULT_OOM;
        }
 
-       /*
-        * See comment in handle_pte_fault() for how this scenario happens, we
-        * need to return NOPAGE so that we drop this page.
-        */
-       if (pmd_devmap_trans_unstable(vmf->pmd))
-               return VM_FAULT_NOPAGE;
-
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                      vmf->address, &vmf->ptl);
+       if (!vmf->pte)
+               return VM_FAULT_NOPAGE;
 
        /* Re-check under ptl */
        if (likely(!vmf_pte_changed(vmf))) {
         * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
         */
        if (!vma->vm_ops->fault) {
-               /*
-                * If we find a migration pmd entry or a none pmd entry, which
-                * should never happen, return SIGBUS
-                */
-               if (unlikely(!pmd_present(*vmf->pmd)))
+               vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+                                              vmf->address, &vmf->ptl);
+               if (unlikely(!vmf->pte))
                        ret = VM_FAULT_SIGBUS;
                else {
-                       vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
-                                                      vmf->pmd,
-                                                      vmf->address,
-                                                      &vmf->ptl);
                        /*
                         * Make sure this is not a temporary clearing of pte
                         * by holding ptl and checking again. A R/M/W update
        pmd = pmd_offset(pud, address);
        VM_BUG_ON(pmd_trans_huge(*pmd));
 
-       if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
-               goto out;
-
        ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
+       if (!ptep)
+               goto out;
        if (!pte_present(*ptep))
                goto unlock;
        *ptepp = ptep;