diff --git a/mm/shmem.c b/mm/shmem.c
index 07a1d22807beb7cfb5f85c3343a5eacdc28fb523..24005c3b345ca80687f841e483bd2a8ad053b4fc 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -296,12 +296,14 @@ bool shmem_charge(struct inode *inode, long pages)
        if (!shmem_inode_acct_block(inode, pages))
                return false;
 
+       /* nrpages adjustment first, then shmem_recalc_inode() when balanced */
+       inode->i_mapping->nrpages += pages;
+
        spin_lock_irqsave(&info->lock, flags);
        info->alloced += pages;
        inode->i_blocks += pages * BLOCKS_PER_PAGE;
        shmem_recalc_inode(inode);
        spin_unlock_irqrestore(&info->lock, flags);
-       inode->i_mapping->nrpages += pages;
 
        return true;
 }
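Why the ordering matters: shmem_recalc_inode() gives back any block accounting that is no longer matched by pages in the mapping, so it must only run once nrpages already includes the pages being charged. A minimal sketch of the helper as it looks in this era (simplified from memory, not a verbatim copy):

	static void shmem_recalc_inode(struct inode *inode)
	{
		struct shmem_inode_info *info = SHMEM_I(inode);
		long freed;

		/* pages charged to the inode but present neither in the
		 * page cache nor in swap: release their accounting */
		freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
		if (freed > 0) {
			info->alloced -= freed;
			inode->i_blocks -= freed * BLOCKS_PER_PAGE;
			shmem_inode_unacct_blocks(inode, freed);
		}
	}

With the old ordering, info->alloced was raised while nrpages still held its old value, so the recalc inside the same locked section could see a spurious surplus and unaccount blocks that are actually in use; bumping nrpages first keeps the two counters balanced at the point the recalc runs.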
@@ -311,6 +313,8 @@ void shmem_uncharge(struct inode *inode, long pages)
        struct shmem_inode_info *info = SHMEM_I(inode);
        unsigned long flags;
 
+       /* nrpages adjustment done by __delete_from_page_cache() or caller */
+
        spin_lock_irqsave(&info->lock, flags);
        info->alloced -= pages;
        inode->i_blocks -= pages * BLOCKS_PER_PAGE;
@@ -493,36 +497,45 @@ next:
                info = list_entry(pos, struct shmem_inode_info, shrinklist);
                inode = &info->vfs_inode;
 
-               if (nr_to_split && split >= nr_to_split) {
-                       iput(inode);
-                       continue;
-               }
+               if (nr_to_split && split >= nr_to_split)
+                       goto leave;
 
-               page = find_lock_page(inode->i_mapping,
+               page = find_get_page(inode->i_mapping,
                                (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
                if (!page)
                        goto drop;
 
+               /* No huge page at the end of the file: nothing to split */
                if (!PageTransHuge(page)) {
-                       unlock_page(page);
                        put_page(page);
                        goto drop;
                }
 
+               /*
+                * Leave the inode on the list if we failed to lock
+                * the page at this time.
+                *
+                * Waiting for the lock may lead to deadlock in the
+                * reclaim path.
+                */
+               if (!trylock_page(page)) {
+                       put_page(page);
+                       goto leave;
+               }
+
                ret = split_huge_page(page);
                unlock_page(page);
                put_page(page);
 
-               if (ret) {
-                       /* split failed: leave it on the list */
-                       iput(inode);
-                       continue;
-               }
+               /* If split failed leave the inode on the list */
+               if (ret)
+                       goto leave;
 
                split++;
 drop:
                list_del_init(&info->shrinklist);
                removed++;
+leave:
                iput(inode);
        }
 
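What the shrinker change buys: find_lock_page() can sleep on the page lock, which is unsafe here because this list walk runs from the reclaim path. Roughly (a sketch of the semantics, not the exact pagecache_get_page() code), the old call expands to a blocking lock:

	/* find_lock_page(mapping, index) is approximately: */
	page = find_get_page(mapping, index);	/* takes a reference only */
	if (page)
		lock_page(page);		/* may sleep until the current holder unlocks */

If the current lock holder is itself waiting on memory reclaim, that sleep may never end; trylock_page() fails immediately instead, and the inode is simply kept on the shrinklist (goto leave) to be retried on a later pass.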
@@ -1519,11 +1532,13 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 {
        struct page *oldpage, *newpage;
        struct address_space *swap_mapping;
+       swp_entry_t entry;
        pgoff_t swap_index;
        int error;
 
        oldpage = *pagep;
-       swap_index = page_private(oldpage);
+       entry.val = page_private(oldpage);
+       swap_index = swp_offset(entry);
        swap_mapping = page_mapping(oldpage);
 
        /*
@@ -1542,7 +1557,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
        __SetPageLocked(newpage);
        __SetPageSwapBacked(newpage);
        SetPageUptodate(newpage);
-       set_page_private(newpage, swap_index);
+       set_page_private(newpage, entry.val);
        SetPageSwapCache(newpage);
 
        /*
@@ -2037,9 +2052,10 @@ unsigned long shmem_get_unmapped_area(struct file *file,
        /*
         * Our priority is to support MAP_SHARED mapped hugely;
         * and support MAP_PRIVATE mapped hugely too, until it is COWed.
-        * But if caller specified an address hint, respect that as before.
+        * But if caller specified an address hint and we allocated area there
+        * successfully, respect that as before.
         */
-       if (uaddr)
+       if (uaddr == addr)
                return addr;
 
        if (shmem_huge != SHMEM_HUGE_FORCE) {
@@ -2073,7 +2089,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
        if (inflated_len < len)
                return addr;
 
-       inflated_addr = get_area(NULL, 0, inflated_len, 0, flags);
+       inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);
        if (IS_ERR_VALUE(inflated_addr))
                return addr;
        if (inflated_addr & ~PAGE_MASK)
@@ -2113,7 +2129,11 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
        struct shmem_inode_info *info = SHMEM_I(inode);
        int retval = -ENOMEM;
 
-       spin_lock_irq(&info->lock);
+       /*
+        * What serializes the accesses to info->flags?
+        * ipc_lock_object() when called from shmctl_do_lock(),
+        * no serialization needed when called from shm_destroy().
+        */
        if (lock && !(info->flags & VM_LOCKED)) {
                if (!user_shm_lock(inode->i_size, user))
                        goto out_nomem;
@@ -2128,7 +2148,6 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
        retval = 0;
 
 out_nomem:
-       spin_unlock_irq(&info->lock);
        return retval;
 }
 
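For context on the dropped info->lock: the new comment relies on the shmctl() path already serializing SHM_LOCK/SHM_UNLOCK on the IPC object lock. A rough sketch of that caller (heavily abridged from memory; shmctl_do_lock() is the helper named in the comment above, and details vary by kernel version):

	/* ipc/shm.c, SHM_LOCK handling, abridged */
	struct user_struct *user = current_user();

	ipc_lock_object(&shp->shm_perm);
	err = shmem_lock(shm_file, 1, user);
	ipc_unlock_object(&shp->shm_perm);

Two concurrent shmctl() calls therefore cannot race on info->flags, and the shm_destroy() caller only runs once no other user of the segment is left.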
@@ -2198,6 +2217,8 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
                        mpol_shared_policy_init(&info->policy, NULL);
                        break;
                }
+
+               lockdep_annotate_inode_mutex_key(inode);
        } else
                shmem_free_inode(sb);
        return inode;
@@ -2227,6 +2248,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
        struct page *page;
        pte_t _dst_pte, *dst_pte;
        int ret;
+       pgoff_t offset, max_off;
 
        ret = -ENOMEM;
        if (!shmem_inode_acct_block(inode, 1))
@@ -2249,7 +2271,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
                                *pagep = page;
                                shmem_inode_unacct_blocks(inode, 1);
                                /* don't free the page */
-                               return -EFAULT;
+                               return -ENOENT;
                        }
                } else {                /* mfill_zeropage_atomic */
                        clear_highpage(page);
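The change from -EFAULT to -ENOENT matters to the caller: in mm/userfaultfd.c, __mcopy_atomic() treats -ENOENT as "the copy_from_user() of the source page faulted; drop mmap_sem, fault the source in, and retry", while -EFAULT is a hard error returned to userspace. A simplified sketch of that retry loop (abridged from memory):

	err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, src_addr, &page);
	if (unlikely(err == -ENOENT)) {
		up_read(&dst_mm->mmap_sem);	/* no locks held while touching user memory */
		page_kaddr = kmap(page);
		if (copy_from_user(page_kaddr,
				   (const void __user *)src_addr, PAGE_SIZE)) {
			kunmap(page);
			err = -EFAULT;		/* the source really is unreadable */
			goto out;
		}
		kunmap(page);
		goto retry;			/* retake mmap_sem and retry the insert */
	}

Returning -EFAULT from here would have turned a transient source-page fault into a UFFDIO_COPY failure.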
@@ -2264,6 +2286,12 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
        __SetPageSwapBacked(page);
        __SetPageUptodate(page);
 
+       ret = -EFAULT;
+       offset = linear_page_index(dst_vma, dst_addr);
+       max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+       if (unlikely(offset >= max_off))
+               goto out_release;
+
        ret = mem_cgroup_try_charge(page, dst_mm, gfp, &memcg, false);
        if (ret)
                goto out_release;
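DIV_ROUND_UP turns the byte size into a page count, rounding up so a partially used last page still counts (this is the generic helper from include/linux/kernel.h):

	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
	/* e.g. i_size = 3*PAGE_SIZE + 1  ->  max_off = 4, valid offsets are 0..3 */

The same check is repeated further down under the PTE lock because i_size can shrink (ftruncate or hole punch) between this point and the moment the PTE is installed; only the re-check under the lock, together with the delete_from_page_cache() added to the error path, keeps a page from staying mapped beyond EOF.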
@@ -2281,19 +2309,35 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
        _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
        if (dst_vma->vm_flags & VM_WRITE)
                _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
+       else {
+               /*
+                * We don't set the pte dirty if the vma has no
+                * VM_WRITE permission, so mark the page dirty or it
+                * could be freed from under us. We could do it
+                * unconditionally before unlock_page(), but doing it
+                * only if VM_WRITE is not set is faster.
+                */
+               set_page_dirty(page);
+       }
 
-       ret = -EEXIST;
        dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+
+       ret = -EFAULT;
+       max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+       if (unlikely(offset >= max_off))
+               goto out_release_uncharge_unlock;
+
+       ret = -EEXIST;
        if (!pte_none(*dst_pte))
                goto out_release_uncharge_unlock;
 
        lru_cache_add_anon(page);
 
-       spin_lock(&info->lock);
+       spin_lock_irq(&info->lock);
        info->alloced++;
        inode->i_blocks += BLOCKS_PER_PAGE;
        shmem_recalc_inode(inode);
-       spin_unlock(&info->lock);
+       spin_unlock_irq(&info->lock);
 
        inc_mm_counter(dst_mm, mm_counter_file(page));
        page_add_file_rmap(page, false);
@@ -2301,13 +2345,15 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
 
        /* No need to invalidate - it was non-present before */
        update_mmu_cache(dst_vma, dst_addr, dst_pte);
-       unlock_page(page);
        pte_unmap_unlock(dst_pte, ptl);
+       unlock_page(page);
        ret = 0;
 out:
        return ret;
 out_release_uncharge_unlock:
        pte_unmap_unlock(dst_pte, ptl);
+       ClearPageDirty(page);
+       delete_from_page_cache(page);
 out_release_uncharge:
        mem_cgroup_cancel_charge(page, memcg, false);
 out_release:
@@ -2579,9 +2625,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
        inode_lock(inode);
        /* We're holding i_mutex so we can access i_size directly */
 
-       if (offset < 0)
-               offset = -EINVAL;
-       else if (offset >= inode->i_size)
+       if (offset < 0 || offset >= inode->i_size)
                offset = -ENXIO;
        else {
                start = offset >> PAGE_SHIFT;
@@ -2617,31 +2661,33 @@ static void shmem_tag_pins(struct address_space *mapping)
        void **slot;
        pgoff_t start;
        struct page *page;
+       unsigned int tagged = 0;
 
        lru_add_drain();
        start = 0;
-       rcu_read_lock();
 
+       spin_lock_irq(&mapping->tree_lock);
        radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
-               page = radix_tree_deref_slot(slot);
+               page = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
                if (!page || radix_tree_exception(page)) {
                        if (radix_tree_deref_retry(page)) {
                                slot = radix_tree_iter_retry(&iter);
                                continue;
                        }
                } else if (page_count(page) - page_mapcount(page) > 1) {
-                       spin_lock_irq(&mapping->tree_lock);
                        radix_tree_tag_set(&mapping->page_tree, iter.index,
                                           SHMEM_TAG_PINNED);
-                       spin_unlock_irq(&mapping->tree_lock);
                }
 
-               if (need_resched()) {
-                       slot = radix_tree_iter_resume(slot, &iter);
-                       cond_resched_rcu();
-               }
+               if (++tagged % 1024)
+                       continue;
+
+               slot = radix_tree_iter_resume(slot, &iter);
+               spin_unlock_irq(&mapping->tree_lock);
+               cond_resched();
+               spin_lock_irq(&mapping->tree_lock);
        }
-       rcu_read_unlock();
+       spin_unlock_irq(&mapping->tree_lock);
 }
 
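A note on the new locking pattern: the walk now holds mapping->tree_lock across the whole loop instead of taking it once per tagged slot, so it has to breathe periodically. `++tagged % 1024` is non-zero for 1023 of every 1024 visited slots, i.e. the lock is dropped and cond_resched() runs once every 1024 iterations. The same batching idiom in isolation (for_each_item()/do_work() are placeholders, not kernel APIs):

	unsigned int count = 0;

	spin_lock_irq(&lock);
	for_each_item(item) {
		do_work(item);
		if (++count % 1024)
			continue;		/* common case: keep the lock */
		spin_unlock_irq(&lock);		/* every 1024th item: allow preemption */
		cond_resched();
		spin_lock_irq(&lock);
	}
	spin_unlock_irq(&lock);

radix_tree_iter_resume() in the real code is what makes it safe to continue the iteration after the lock has been dropped.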
 /*
@@ -2853,7 +2899,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
                }
 
                shmem_falloc.waitq = &shmem_falloc_waitq;
-               shmem_falloc.start = unmap_start >> PAGE_SHIFT;
+               shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
                shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
                spin_lock(&inode->i_lock);
                inode->i_private = &shmem_falloc;
@@ -3056,16 +3102,20 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 {
        struct inode *inode = d_inode(old_dentry);
-       int ret;
+       int ret = 0;
 
        /*
         * No ordinary (disk based) filesystem counts links as inodes;
         * but each new link needs a new dentry, pinning lowmem, and
         * tmpfs dentries cannot be pruned until they are unlinked.
+        * But if an O_TMPFILE file is linked into the tmpfs, the
+        * first link must skip that, to get the accounting right.
         */
-       ret = shmem_reserve_inode(inode->i_sb);
-       if (ret)
-               goto out;
+       if (inode->i_nlink) {
+               ret = shmem_reserve_inode(inode->i_sb);
+               if (ret)
+                       goto out;
+       }
 
        dir->i_size += BOGO_DIRENT_SIZE;
        inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
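The i_nlink test distinguishes the first link of an O_TMPFILE inode (i_nlink == 0; its inode was already reserved and accounted when the file was created) from an ordinary hard link, which does need a fresh reservation. The path that reaches shmem_link() with i_nlink == 0 can be driven from userspace; a minimal sketch (the /dev/shm paths and names are only examples):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char proc_path[64];

		/* An unnamed tmpfs file: the inode exists but i_nlink == 0. */
		int fd = open("/dev/shm", O_TMPFILE | O_RDWR, 0600);
		if (fd < 0)
			return 1;

		/* Giving it a name is the "first link of an unlinked inode" case,
		 * so shmem_link() must not reserve (and account) a second inode. */
		snprintf(proc_path, sizeof(proc_path), "/proc/self/fd/%d", fd);
		if (linkat(AT_FDCWD, proc_path, AT_FDCWD, "/dev/shm/named",
			   AT_SYMLINK_FOLLOW) < 0)
			return 1;

		close(fd);
		return 0;
	}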