/*
 * hugetlbpage-backed filesystem.  Based on ramfs.
 *
 * Nadia Yvette Chambers, 2002
 *
 * Copyright (C) 2002 Linus Torvalds.
 * License: GPL
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/capability.h>
#include <linux/ctype.h>
#include <linux/backing-dev.h>
#include <linux/hugetlb.h>
#include <linux/pagevec.h>
#include <linux/fs_parser.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/dnotify.h>
#include <linux/statfs.h>
#include <linux/security.h>
#include <linux/magic.h>
#include <linux/migrate.h>
#include <linux/uio.h>

#include <linux/uaccess.h>
#include <linux/sched/mm.h>

#define CREATE_TRACE_POINTS
#include <trace/events/hugetlbfs.h>

static const struct address_space_operations hugetlbfs_aops;
static const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;

enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT };

struct hugetlbfs_fs_context {
	struct hstate		*hstate;
	unsigned long long	max_size_opt;
	unsigned long long	min_size_opt;
	long			max_hpages;
	long			nr_inodes;
	long			min_hpages;
	enum hugetlbfs_size_type max_val_type;
	enum hugetlbfs_size_type min_val_type;
	kuid_t			uid;
	kgid_t			gid;
	umode_t			mode;
};

int sysctl_hugetlb_shm_group;

enum hugetlb_param {
	Opt_gid,
	Opt_min_size,
	Opt_mode,
	Opt_nr_inodes,
	Opt_pagesize,
	Opt_size,
	Opt_uid,
};

static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
	fsparam_gid   ("gid",		Opt_gid),
	fsparam_string("min_size",	Opt_min_size),
	fsparam_u32oct("mode",		Opt_mode),
	fsparam_string("nr_inodes",	Opt_nr_inodes),
	fsparam_string("pagesize",	Opt_pagesize),
	fsparam_string("size",		Opt_size),
	fsparam_uid   ("uid",		Opt_uid),
	{}
};

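/*
 * For reference, the parameters above correspond to a mount invocation
 * along the lines of (values purely illustrative):
 *
 *   mount -t hugetlbfs -o uid=1000,gid=1000,mode=0700,pagesize=2M,size=1G,min_size=512M,nr_inodes=64 none /mnt/huge
 */
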
/*
 * Mask used when checking the page offset value passed in via system
 * calls.  This value will be converted to a loff_t which is signed.
 * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the
 * value.  The extra bit (- 1 in the shift value) is to take the sign
 * bit into account.
 */
#define PGOFF_LOFFT_MAX \
	(((1UL << (PAGE_SHIFT + 1)) - 1) <<  (BITS_PER_LONG - (PAGE_SHIFT + 1)))

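/*
 * For example, with 4K pages on a 64-bit kernel (PAGE_SHIFT == 12,
 * BITS_PER_LONG == 64), the mask above has bits 51..63 set: any pgoff
 * with one of those bits set would reach bit 63 -- the loff_t sign
 * bit -- once shifted left by PAGE_SHIFT into a byte offset.
 */
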
static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma)
{
	/* Unfortunately we have to reassign vma->vm_private_data. */
	return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma);
}

static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc)
{
	struct file *file = desc->file;
	struct inode *inode = file_inode(file);
	loff_t len, vma_len;
	int ret;
	struct hstate *h = hstate_file(file);
	vm_flags_t vm_flags;

	/*
	 * vma address alignment (but not the pgoff alignment) has
	 * already been checked by prepare_hugepage_range.  If you add
	 * any error returns here, do so after setting VM_HUGETLB, so
	 * is_vm_hugetlb_page tests below unmap_region go the right
	 * way when do_mmap unwinds (may be important on powerpc
	 * and ia64).
	 */
	desc->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
	desc->vm_ops = &hugetlb_vm_ops;

	/*
	 * The page-based offset in vm_pgoff could be large enough to
	 * overflow a loff_t when converted to a byte offset.  This can
	 * only happen on architectures where sizeof(loff_t) ==
	 * sizeof(unsigned long).  So, only check in those instances.
	 */
	if (sizeof(unsigned long) == sizeof(loff_t)) {
		if (desc->pgoff & PGOFF_LOFFT_MAX)
			return -EINVAL;
	}

	/* must be huge page aligned */
	if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
		return -EINVAL;

	vma_len = (loff_t)vma_desc_size(desc);
	len = vma_len + ((loff_t)desc->pgoff << PAGE_SHIFT);
	/* check for overflow */
	if (len < vma_len)
		return -EINVAL;

	inode_lock(inode);
	file_accessed(file);

	ret = -ENOMEM;

	vm_flags = desc->vm_flags;
	/*
	 * For SHM_HUGETLB, the pages are reserved in the shmget() call,
	 * so skip reserving here.  Note: the inode flag S_PRIVATE is set
	 * only for SHM hugetlbfs files.
	 */
	if (inode->i_flags & S_PRIVATE)
		vm_flags |= VM_NORESERVE;

	if (hugetlb_reserve_pages(inode,
				desc->pgoff >> huge_page_order(h),
				len >> huge_page_shift(h), desc,
				vm_flags) < 0)
		goto out;

	ret = 0;
	if ((desc->vm_flags & VM_WRITE) && inode->i_size < len)
		i_size_write(inode, len);
out:
	inode_unlock(inode);

	if (!ret) {
		/* Allocate the VMA lock after we set it up. */
		desc->action.success_hook = hugetlb_file_mmap_prepare_success;
		/*
		 * We cannot permit rmap to find this VMA in the time
		 * between the VMA being inserted into the VMA tree and the
		 * completion/success hook being invoked.
		 *
		 * This is because we establish a per-VMA hugetlb lock which
		 * can be raced by rmap.
		 */
		desc->action.hide_from_rmap_until_complete = true;
	}
	return ret;
}

/*
 * Called under mmap_write_lock(mm).
 */

unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
			  unsigned long len, unsigned long pgoff,
			  unsigned long flags)
{
	unsigned long addr0 = 0;
	struct hstate *h = hstate_file(file);

	if (len & ~huge_page_mask(h))
		return -EINVAL;
	if ((flags & MAP_FIXED) && (addr & ~huge_page_mask(h)))
		return -EINVAL;
	if (addr)
		addr0 = ALIGN(addr, huge_page_size(h));

	return mm_get_unmapped_area_vmflags(file, addr0, len, pgoff, flags, 0);
}

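/*
 * E.g. with 2M huge pages, a hint address of 0x10001000 is rounded up
 * to 0x10200000 before the generic search runs, while an unaligned
 * MAP_FIXED request fails with -EINVAL.
 */
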
/*
 * Someone wants to read @bytes from a HWPOISON hugetlb @folio from @offset.
 * Returns the maximum number of bytes one can read without touching the 1st
 * raw HWPOISON page.
 */
static size_t adjust_range_hwpoison(struct folio *folio, size_t offset,
				    size_t bytes)
{
	struct page *page = folio_page(folio, offset / PAGE_SIZE);
	size_t safe_bytes;

	if (is_raw_hwpoison_page_in_hugepage(page))
		return 0;
	/* Safe to read the remaining bytes in this page. */
	safe_bytes = PAGE_SIZE - (offset % PAGE_SIZE);
	page++;

	/* Check each remaining page as long as we are not done yet. */
	for (; safe_bytes < bytes; safe_bytes += PAGE_SIZE, page++)
		if (is_raw_hwpoison_page_in_hugepage(page))
			break;

	return min(safe_bytes, bytes);
}

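/*
 * Worked example: if the second 4K page of the huge folio is the first
 * hwpoisoned one, a read of bytes == 0x3000 at offset == 0x800 returns
 * PAGE_SIZE - 0x800 == 0x800 -- the remainder of page 0 -- because the
 * scan stops at poisoned page 1.
 */
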
/*
 * Support for read() - Find the folio attached to f_mapping and copy out the
 * data.  This provides functionality similar to filemap_read().
 */
static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct hstate *h = hstate_file(file);
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	unsigned long index = iocb->ki_pos >> huge_page_shift(h);
	unsigned long offset = iocb->ki_pos & ~huge_page_mask(h);
	unsigned long end_index;
	loff_t isize;
	ssize_t retval = 0;

	while (iov_iter_count(to)) {
		struct folio *folio;
		size_t nr, copied, want;

		/* nr is the maximum number of bytes to copy from this folio */
		nr = huge_page_size(h);
		isize = i_size_read(inode);
		if (!isize)
			break;
		end_index = (isize - 1) >> huge_page_shift(h);
		if (index > end_index)
			break;
		if (index == end_index) {
			nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
			if (nr <= offset)
				break;
		}
		nr = nr - offset;

		/* Find the folio */
		folio = filemap_lock_hugetlb_folio(h, mapping, index);
		if (IS_ERR(folio)) {
			/*
			 * We have a HOLE, zero out the user-buffer for the
			 * length of the hole or request.
			 */
			copied = iov_iter_zero(nr, to);
		} else {
			folio_unlock(folio);

			if (!folio_test_hwpoison(folio))
				want = nr;
			else {
				/*
				 * Adjust how many bytes are safe to read
				 * without touching the 1st raw HWPOISON page
				 * after offset.
				 */
				want = adjust_range_hwpoison(folio, offset, nr);
				if (want == 0) {
					folio_put(folio);
					retval = -EIO;
					break;
				}
			}

			/*
			 * We have the folio, copy it to the user space buffer.
			 */
			copied = copy_folio_to_iter(folio, offset, want, to);
			folio_put(folio);
		}
		offset += copied;
		retval += copied;
		if (copied != nr && iov_iter_count(to)) {
			if (!retval)
				retval = -EFAULT;
			break;
		}
		index += offset >> huge_page_shift(h);
		offset &= ~huge_page_mask(h);
	}
	iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset;
	return retval;
}

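/*
 * There is no write(2) support: hugetlbfs files gain contents only via
 * mmap() or fallocate().  write_begin below therefore rejects everything
 * and write_end must be unreachable.
 */
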
static int hugetlbfs_write_begin(const struct kiocb *iocb,
				 struct address_space *mapping,
				 loff_t pos, unsigned len,
				 struct folio **foliop, void **fsdata)
{
	return -EINVAL;
}

static int hugetlbfs_write_end(const struct kiocb *iocb,
			       struct address_space *mapping,
			       loff_t pos, unsigned len, unsigned copied,
			       struct folio *folio, void *fsdata)
{
	BUG();
	return -EINVAL;
}

static void hugetlb_delete_from_page_cache(struct folio *folio)
{
	folio_clear_dirty(folio);
	folio_clear_uptodate(folio);
	filemap_remove_folio(folio);
}

/*
 * Called with i_mmap_rwsem held for inode based vma maps.  This makes
 * sure vma (and vm_mm) will not go away.  We also hold the hugetlb fault
 * mutex for the page in the mapping.  So, we cannot race with the page
 * being faulted into the vma.
 */
static bool hugetlb_vma_maps_pfn(struct vm_area_struct *vma,
				 unsigned long addr, unsigned long pfn)
{
	pte_t *ptep, pte;

	ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma)));
	if (!ptep)
		return false;

	pte = huge_ptep_get(vma->vm_mm, addr, ptep);
	if (huge_pte_none(pte) || !pte_present(pte))
		return false;

	if (pte_pfn(pte) == pfn)
		return true;

	return false;
}

/*
 * Can vma_offset_start/vma_offset_end overflow on 32-bit arches?
 * No, because the interval tree returns us only those vmas
 * which overlap the truncated area starting at pgoff,
 * and no vma on a 32-bit arch can span beyond 4GB.
 */
static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start)
{
	unsigned long offset = 0;

	if (vma->vm_pgoff < start)
		offset = (start - vma->vm_pgoff) << PAGE_SHIFT;

	return vma->vm_start + offset;
}

static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end)
{
	unsigned long t_end;

	if (!end)
		return vma->vm_end;

	t_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start;
	if (t_end > vma->vm_end)
		t_end = vma->vm_end;
	return t_end;
}

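/*
 * Example: for a vma mapping file pages [10, 20) (vm_pgoff == 10) and a
 * truncation point of page 15, v_start is vm_start + (5 << PAGE_SHIFT),
 * so only the tail of the vma gets unmapped.
 */
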
/*
 * Called with hugetlb fault mutex held.  Therefore, no more mappings to
 * this folio can be created while executing the routine.
 */
static void hugetlb_unmap_file_folio(struct hstate *h,
				     struct address_space *mapping,
				     struct folio *folio, pgoff_t index)
{
	struct rb_root_cached *root = &mapping->i_mmap;
	struct hugetlb_vma_lock *vma_lock;
	unsigned long pfn = folio_pfn(folio);
	struct vm_area_struct *vma;
	unsigned long v_start;
	unsigned long v_end;
	pgoff_t start, end;

	start = index * pages_per_huge_page(h);
	end = (index + 1) * pages_per_huge_page(h);

	i_mmap_lock_write(mapping);
retry:
	vma_lock = NULL;
	vma_interval_tree_foreach(vma, root, start, end - 1) {
		v_start = vma_offset_start(vma, start);
		v_end = vma_offset_end(vma, end);

		if (!hugetlb_vma_maps_pfn(vma, v_start, pfn))
			continue;

		if (!hugetlb_vma_trylock_write(vma)) {
			vma_lock = vma->vm_private_data;
			/*
			 * If we cannot get the vma lock, we need to drop
			 * i_mmap_rwsem and take locks in order.  First,
			 * take a ref on the vma_lock structure so that
			 * we can be guaranteed it will not go away when
			 * dropping i_mmap_rwsem.
			 */
			kref_get(&vma_lock->refs);
			break;
		}

		unmap_hugepage_range(vma, v_start, v_end, NULL,
				     ZAP_FLAG_DROP_MARKER);
		hugetlb_vma_unlock_write(vma);
	}

	i_mmap_unlock_write(mapping);

	if (vma_lock) {
		/*
		 * Wait on vma_lock.  We know it is still valid as we have
		 * a reference.  We must 'open code' vma locking as we do
		 * not know if vma_lock is still attached to vma.
		 */
		down_write(&vma_lock->rw_sema);
		i_mmap_lock_write(mapping);

		vma = vma_lock->vma;
		if (!vma) {
			/*
			 * If the lock is no longer attached to the vma, then
			 * just unlock, drop our reference and retry looking
			 * for other vmas.
			 */
			up_write(&vma_lock->rw_sema);
			kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
			goto retry;
		}

		/*
		 * vma_lock is still attached to vma.  Check to see if the
		 * vma still maps the page and if so, unmap.
		 */
		v_start = vma_offset_start(vma, start);
		v_end = vma_offset_end(vma, end);
		if (hugetlb_vma_maps_pfn(vma, v_start, pfn))
			unmap_hugepage_range(vma, v_start, v_end, NULL,
					     ZAP_FLAG_DROP_MARKER);

		kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
		hugetlb_vma_unlock_write(vma);

		goto retry;
	}
}

static void
hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
		      zap_flags_t zap_flags)
{
	struct vm_area_struct *vma;

	/*
	 * end == 0 indicates that the entire range after start should be
	 * unmapped.  Note, end is exclusive, whereas the interval tree takes
	 * an inclusive "last".
	 */
	vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) {
		unsigned long v_start;
		unsigned long v_end;

		if (!hugetlb_vma_trylock_write(vma))
			continue;

		v_start = vma_offset_start(vma, start);
		v_end = vma_offset_end(vma, end);

		unmap_hugepage_range(vma, v_start, v_end, NULL, zap_flags);

		/*
		 * Note that vma lock only exists for shared/non-private
		 * vmas.  Therefore, lock is not held when calling
		 * unmap_hugepage_range for private vmas.
		 */
		hugetlb_vma_unlock_write(vma);
	}
}

/*
 * Called with hugetlb fault mutex held.
 * Returns true if page was actually removed, false otherwise.
 */
static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
				      struct address_space *mapping,
				      struct folio *folio, pgoff_t index,
				      bool truncate_op)
{
	bool ret = false;

	/*
	 * If the folio is mapped, it was faulted in after being
	 * unmapped in the caller, or hugetlb_vmdelete_list() skipped
	 * unmapping it because it failed to grab the lock.  Unmap (again)
	 * while holding the fault mutex.  The mutex will prevent
	 * faults until we finish removing the folio.  Hold the folio
	 * lock to guarantee no concurrent migration.
	 */
	folio_lock(folio);
	if (unlikely(folio_mapped(folio)))
		hugetlb_unmap_file_folio(h, mapping, folio, index);

	/*
	 * We must remove the folio from the page cache before removing
	 * the region/reserve map (hugetlb_unreserve_pages).  In rare out
	 * of memory conditions, removal of the region/reserve map could
	 * fail.  Correspondingly, the subpool and global reserve usage
	 * count can need to be adjusted.
	 */
	VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio);
	hugetlb_delete_from_page_cache(folio);
	ret = true;
	if (!truncate_op) {
		if (unlikely(hugetlb_unreserve_pages(inode, index,
							index + 1, 1)))
			hugetlb_fix_reserve_counts(inode);
	}

	folio_unlock(folio);
	return ret;
}

/*
 * remove_inode_hugepages handles two distinct cases: truncation and hole
 * punch.  There are subtle differences in operation for each case.
 *
 * truncation is indicated by the end of range being LLONG_MAX
 *	In this case, we first scan the range and release found pages.
 *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserve
 *	maps and global counts.  Page faults can race with truncation.
 *	During faults, hugetlb_no_page() checks i_size before page allocation,
 *	and again after obtaining the page table lock.  It will 'back out'
 *	allocations in the truncated range.
 * hole punch is indicated if the end is not LLONG_MAX
 *	In the hole punch case we scan the range and release found pages.
 *	Only when releasing a page is the associated region/reserve map
 *	deleted.  The region/reserve map for ranges without associated
 *	pages is not modified.  Page faults can race with hole punch.
 *	This is indicated if we find a mapped page.
 * Note: If the passed end of range value is beyond the end of file, but
 * not LLONG_MAX this routine still performs a hole punch operation.
 */
static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
				   loff_t lend)
{
	struct hstate *h = hstate_inode(inode);
	struct address_space *mapping = &inode->i_data;
	const pgoff_t end = lend >> PAGE_SHIFT;
	struct folio_batch fbatch;
	pgoff_t next, index;
	int i, freed = 0;
	bool truncate_op = (lend == LLONG_MAX);

	folio_batch_init(&fbatch);
	next = lstart >> PAGE_SHIFT;
	while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
			struct folio *folio = fbatch.folios[i];
			u32 hash = 0;

			index = folio->index >> huge_page_order(h);
			hash = hugetlb_fault_mutex_hash(mapping, index);
			mutex_lock(&hugetlb_fault_mutex_table[hash]);

			/*
			 * Remove folio that was part of folio_batch.
			 */
			if (remove_inode_single_folio(h, inode, mapping, folio,
							index, truncate_op))
				freed++;

			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}

	if (truncate_op)
		(void)hugetlb_unreserve_pages(inode,
				lstart >> huge_page_shift(h),
				LONG_MAX, freed);
}

static void hugetlbfs_evict_inode(struct inode *inode)
{
	struct resv_map *resv_map;

	trace_hugetlbfs_evict_inode(inode);
	remove_inode_hugepages(inode, 0, LLONG_MAX);

	/*
	 * Get the resv_map from the address space embedded in the inode.
	 * This is the address space which points to any resv_map allocated
	 * at inode creation time.  If this is a device special inode,
	 * i_mapping may not point to the original address space.
	 */
	resv_map = (struct resv_map *)(&inode->i_data)->i_private_data;
	/* Only regular and link inodes have associated reserve maps */
	if (resv_map)
		resv_map_release(&resv_map->refs);
	clear_inode(inode);
}

static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
	pgoff_t pgoff;
	struct address_space *mapping = inode->i_mapping;
	struct hstate *h = hstate_inode(inode);

	BUG_ON(offset & ~huge_page_mask(h));
	pgoff = offset >> PAGE_SHIFT;

	i_size_write(inode, offset);
	i_mmap_lock_write(mapping);
	if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
		hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
				      ZAP_FLAG_DROP_MARKER);
	i_mmap_unlock_write(mapping);
	remove_inode_hugepages(inode, offset, LLONG_MAX);
}

static void hugetlbfs_zero_partial_page(struct hstate *h,
					struct address_space *mapping,
					loff_t start,
					loff_t end)
{
	pgoff_t idx = start >> huge_page_shift(h);
	struct folio *folio;

	folio = filemap_lock_hugetlb_folio(h, mapping, idx);
	if (IS_ERR(folio))
		return;

	start = start & ~huge_page_mask(h);
	end = end & ~huge_page_mask(h);
	if (!end)
		end = huge_page_size(h);

	folio_zero_segment(folio, (size_t)start, (size_t)end);

	folio_unlock(folio);
	folio_put(folio);
}

static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
	struct address_space *mapping = inode->i_mapping;
	struct hstate *h = hstate_inode(inode);
	loff_t hpage_size = huge_page_size(h);
	loff_t hole_start, hole_end;

	/*
	 * hole_start and hole_end indicate the full pages within the hole.
	 */
	hole_start = round_up(offset, hpage_size);
	hole_end = round_down(offset + len, hpage_size);

	inode_lock(inode);

	/* protected by i_rwsem */
	if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
		inode_unlock(inode);
		return -EPERM;
	}

	i_mmap_lock_write(mapping);

	/* If range starts before first full page, zero partial page. */
	if (offset < hole_start)
		hugetlbfs_zero_partial_page(h, mapping,
				offset, min(offset + len, hole_start));

	/* Unmap users of full pages in the hole. */
	if (hole_end > hole_start) {
		if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
			hugetlb_vmdelete_list(&mapping->i_mmap,
					      hole_start >> PAGE_SHIFT,
					      hole_end >> PAGE_SHIFT, 0);
	}

	/* If range extends beyond last full page, zero partial page. */
	if ((offset + len) > hole_end && (offset + len) > hole_start)
		hugetlbfs_zero_partial_page(h, mapping,
				hole_end, offset + len);

	i_mmap_unlock_write(mapping);

	/* Remove full pages from the file. */
	if (hole_end > hole_start)
		remove_inode_hugepages(inode, hole_start, hole_end);

	inode_unlock(inode);

	return 0;
}

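/*
 * Example: punching 1M at offset 1M in a file backed by 2M pages yields
 * hole_start == hole_end == 2M, so only the second half of folio 0 is
 * zeroed and no full pages are removed.
 */
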
static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
				loff_t len)
{
	struct inode *inode = file_inode(file);
	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
	struct address_space *mapping = inode->i_mapping;
	struct hstate *h = hstate_inode(inode);
	struct vm_area_struct pseudo_vma;
	struct mm_struct *mm = current->mm;
	loff_t hpage_size = huge_page_size(h);
	unsigned long hpage_shift = huge_page_shift(h);
	pgoff_t start, index, end;
	int error;
	u32 hash;

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (mode & FALLOC_FL_PUNCH_HOLE) {
		error = hugetlbfs_punch_hole(inode, offset, len);
		goto out_nolock;
	}

	/*
	 * Default preallocate case.
	 * For this range, start is rounded down and end is rounded up
	 * as well as being converted to page offsets.
	 */
	start = offset >> hpage_shift;
	end = (offset + len + hpage_size - 1) >> hpage_shift;

	inode_lock(inode);

	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
	error = inode_newsize_ok(inode, offset + len);
	if (error)
		goto out;

	if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
		error = -EPERM;
		goto out;
	}

	/*
	 * Initialize a pseudo vma as this is required by the huge page
	 * allocation routines.
	 */
	vma_init(&pseudo_vma, mm);
	vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
	pseudo_vma.vm_file = file;

	for (index = start; index < end; index++) {
		/*
		 * This is supposed to be the vaddr where the page is being
		 * faulted in, but we have no vaddr here.
		 */
		struct folio *folio;
		unsigned long addr;

		cond_resched();

		/*
		 * The fallocate(2) manpage permits EINTR; we may have been
		 * interrupted because we are using up too much memory.
		 */
		if (signal_pending(current)) {
			error = -EINTR;
			break;
		}

		/* addr is the offset within the file (zero based) */
		addr = index * hpage_size;

		/* mutex taken here; serializes against the fault path and hole punch */
		hash = hugetlb_fault_mutex_hash(mapping, index);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);

		/* See if already present in mapping to avoid alloc/free */
		folio = filemap_get_folio(mapping, index << huge_page_order(h));
		if (!IS_ERR(folio)) {
			folio_put(folio);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			continue;
		}

		/*
		 * Allocate the folio without setting the avoid_reserve
		 * argument.  There certainly are no reserves associated with
		 * the pseudo_vma.  However, there could be shared mappings
		 * with reserves for the file at the inode level.  If we
		 * fallocate folios in these areas, we need to consume the
		 * reserves to keep reservation accounting consistent.
		 */
		folio = alloc_hugetlb_folio(&pseudo_vma, addr, false);
		if (IS_ERR(folio)) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			error = PTR_ERR(folio);
			goto out;
		}
		folio_zero_user(folio, addr);
		__folio_mark_uptodate(folio);
		error = hugetlb_add_to_page_cache(folio, mapping, index);
		if (unlikely(error)) {
			restore_reserve_on_error(h, &pseudo_vma, addr, folio);
			folio_put(folio);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out;
		}

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);

		folio_set_hugetlb_migratable(folio);
		/*
		 * folio_unlock because locked by hugetlb_add_to_page_cache()
		 * folio_put() due to reference from alloc_hugetlb_folio()
		 */
		folio_unlock(folio);
		folio_put(folio);
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
		i_size_write(inode, offset + len);
	inode_set_ctime_current(inode);
out:
	inode_unlock(inode);

out_nolock:
	trace_hugetlbfs_fallocate(inode, mode, offset, len, error);
	return error;
}

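/*
 * Example: fallocate(fd, 0, 0, 4M) on a 2M-page mount preallocates the
 * folios at indexes 0 and 1 and extends i_size to 4M; with
 * FALLOC_FL_KEEP_SIZE the folios are still instantiated but i_size is
 * left untouched.
 */
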
static int hugetlbfs_setattr(struct mnt_idmap *idmap,
			     struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	struct hstate *h = hstate_inode(inode);
	int error;
	unsigned int ia_valid = attr->ia_valid;
	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);

	error = setattr_prepare(idmap, dentry, attr);
	if (error)
		return error;

	trace_hugetlbfs_setattr(inode, dentry, attr);

	if (ia_valid & ATTR_SIZE) {
		loff_t oldsize = inode->i_size;
		loff_t newsize = attr->ia_size;

		if (newsize & ~huge_page_mask(h))
			return -EINVAL;
		/* protected by i_rwsem */
		if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
		    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
			return -EPERM;
		hugetlb_vmtruncate(inode, newsize);
	}

	setattr_copy(idmap, inode, attr);
	mark_inode_dirty(inode);
	return 0;
}

static struct inode *hugetlbfs_get_root(struct super_block *sb,
					struct hugetlbfs_fs_context *ctx)
{
	struct inode *inode;

	inode = new_inode(sb);
	if (inode) {
		inode->i_ino = get_next_ino();
		inode->i_mode = S_IFDIR | ctx->mode;
		inode->i_uid = ctx->uid;
		inode->i_gid = ctx->gid;
		simple_inode_init_ts(inode);
		inode->i_op = &hugetlbfs_dir_inode_operations;
		inode->i_fop = &simple_dir_operations;
		/* directory inodes start off with i_nlink == 2 (for "." entry) */
		inc_nlink(inode);
		lockdep_annotate_inode_mutex_key(inode);
	}
	return inode;
}

/*
 * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
 * be taken from reclaim -- unlike regular filesystems.  This needs an
 * annotation because huge_pmd_share() does an allocation under hugetlb's
 * i_mmap_rwsem.
 */
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;

static struct inode *hugetlbfs_get_inode(struct super_block *sb,
					 struct mnt_idmap *idmap,
					 struct inode *dir,
					 umode_t mode, dev_t dev)
{
	struct inode *inode;
	struct resv_map *resv_map = NULL;

	/*
	 * Reserve maps are only needed for inodes that can have associated
	 * page allocations.
	 */
	if (S_ISREG(mode) || S_ISLNK(mode)) {
		resv_map = resv_map_alloc();
		if (!resv_map)
			return NULL;
	}

	inode = new_inode(sb);
	if (inode) {
		struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);

		inode->i_ino = get_next_ino();
		inode_init_owner(idmap, inode, dir, mode);
		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
				  &hugetlbfs_i_mmap_rwsem_key);
		inode->i_mapping->a_ops = &hugetlbfs_aops;
		simple_inode_init_ts(inode);
		inode->i_mapping->i_private_data = resv_map;
		info->seals = F_SEAL_SEAL;
		switch (mode & S_IFMT) {
		default:
			init_special_inode(inode, mode, dev);
			break;
		case S_IFREG:
			inode->i_op = &hugetlbfs_inode_operations;
			inode->i_fop = &hugetlbfs_file_operations;
			break;
		case S_IFDIR:
			inode->i_op = &hugetlbfs_dir_inode_operations;
			inode->i_fop = &simple_dir_operations;

			/* directory inodes start off with i_nlink == 2 (for "." entry) */
			inc_nlink(inode);
			break;
		case S_IFLNK:
			inode->i_op = &page_symlink_inode_operations;
			inode_nohighmem(inode);
			break;
		}
		lockdep_annotate_inode_mutex_key(inode);
		trace_hugetlbfs_alloc_inode(inode, dir, mode);
	} else {
		if (resv_map)
			kref_put(&resv_map->refs, resv_map_release);
	}

	return inode;
}

/*
 * File creation.  Allocate an inode, and we're done..
 */
static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
			   struct dentry *dentry, umode_t mode, dev_t dev)
{
	struct inode *inode;

	inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, dev);
	if (!inode)
		return -ENOSPC;
	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
	d_make_persistent(dentry, inode);
	return 0;
}

static struct dentry *hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
				      struct dentry *dentry, umode_t mode)
{
	int retval = hugetlbfs_mknod(idmap, dir, dentry,
				     mode | S_IFDIR, 0);
	if (!retval)
		inc_nlink(dir);
	return ERR_PTR(retval);
}

static int hugetlbfs_create(struct mnt_idmap *idmap,
			    struct inode *dir, struct dentry *dentry,
			    umode_t mode, bool excl)
{
	return hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
}

static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
			     struct inode *dir, struct file *file,
			     umode_t mode)
{
	struct inode *inode;

	inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode | S_IFREG, 0);
	if (!inode)
		return -ENOSPC;
	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
	d_tmpfile(file, inode);
	return finish_open_simple(file, 0);
}

static int hugetlbfs_symlink(struct mnt_idmap *idmap,
			     struct inode *dir, struct dentry *dentry,
			     const char *symname)
{
	const umode_t mode = S_IFLNK|S_IRWXUGO;
	struct inode *inode;
	int error = -ENOSPC;

	inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, 0);
	if (inode) {
		int l = strlen(symname)+1;
		error = page_symlink(inode, symname, l);
		if (!error)
			d_make_persistent(dentry, inode);
		else
			iput(inode);
	}
	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));

	return error;
}

#ifdef CONFIG_MIGRATION
static int hugetlbfs_migrate_folio(struct address_space *mapping,
				   struct folio *dst, struct folio *src,
				   enum migrate_mode mode)
{
	int rc;

	rc = migrate_huge_page_move_mapping(mapping, dst, src);
	if (rc)
		return rc;

	if (hugetlb_folio_subpool(src)) {
		hugetlb_set_folio_subpool(dst,
					hugetlb_folio_subpool(src));
		hugetlb_set_folio_subpool(src, NULL);
	}

	folio_migrate_flags(dst, src);

	return 0;
}
#else
#define hugetlbfs_migrate_folio NULL
#endif

static int hugetlbfs_error_remove_folio(struct address_space *mapping,
					struct folio *folio)
{
	return 0;
}

/*
 * Display the mount options in /proc/mounts.
 */
static int hugetlbfs_show_options(struct seq_file *m, struct dentry *root)
{
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(root->d_sb);
	struct hugepage_subpool *spool = sbinfo->spool;
	unsigned long hpage_size = huge_page_size(sbinfo->hstate);
	unsigned hpage_shift = huge_page_shift(sbinfo->hstate);
	char mod;

	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
		seq_printf(m, ",uid=%u",
			   from_kuid_munged(&init_user_ns, sbinfo->uid));
	if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
		seq_printf(m, ",gid=%u",
			   from_kgid_munged(&init_user_ns, sbinfo->gid));
	if (sbinfo->mode != 0755)
		seq_printf(m, ",mode=%o", sbinfo->mode);
	if (sbinfo->max_inodes != -1)
		seq_printf(m, ",nr_inodes=%lu", sbinfo->max_inodes);

	hpage_size /= 1024;
	mod = 'K';
	if (hpage_size >= 1024) {
		hpage_size /= 1024;
		mod = 'M';
	}
	seq_printf(m, ",pagesize=%lu%c", hpage_size, mod);
	if (spool) {
		if (spool->max_hpages != -1)
			seq_printf(m, ",size=%llu",
				   (unsigned long long)spool->max_hpages << hpage_shift);
		if (spool->min_hpages != -1)
			seq_printf(m, ",min_size=%llu",
				   (unsigned long long)spool->min_hpages << hpage_shift);
	}
	return 0;
}

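/*
 * A 2M mount with non-default ids would show up in /proc/mounts roughly
 * as: ",uid=1000,gid=1000,mode=700,pagesize=2M,size=1073741824".
 */
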
static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
	struct hstate *h = hstate_inode(d_inode(dentry));
	u64 id = huge_encode_dev(dentry->d_sb->s_dev);

	buf->f_fsid = u64_to_fsid(id);
	buf->f_type = HUGETLBFS_MAGIC;
	buf->f_bsize = huge_page_size(h);
	if (sbinfo) {
		spin_lock(&sbinfo->stat_lock);
		/* If no limits set, just report 0 or -1 for max/free/used
		 * blocks, like simple_statfs() */
		if (sbinfo->spool) {
			long free_pages;

			spin_lock_irq(&sbinfo->spool->lock);
			buf->f_blocks = sbinfo->spool->max_hpages;
			free_pages = sbinfo->spool->max_hpages
				- sbinfo->spool->used_hpages;
			buf->f_bavail = buf->f_bfree = free_pages;
			spin_unlock_irq(&sbinfo->spool->lock);
			buf->f_files = sbinfo->max_inodes;
			buf->f_ffree = sbinfo->free_inodes;
		}
		spin_unlock(&sbinfo->stat_lock);
	}
	buf->f_namelen = NAME_MAX;
	return 0;
}

static void hugetlbfs_put_super(struct super_block *sb)
{
	struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);

	if (sbi) {
		sb->s_fs_info = NULL;

		if (sbi->spool)
			hugepage_put_subpool(sbi->spool);

		kfree(sbi);
	}
}

static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
	if (sbinfo->free_inodes >= 0) {
		spin_lock(&sbinfo->stat_lock);
		if (unlikely(!sbinfo->free_inodes)) {
			spin_unlock(&sbinfo->stat_lock);
			return 0;
		}
		sbinfo->free_inodes--;
		spin_unlock(&sbinfo->stat_lock);
	}

	return 1;
}

static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
	if (sbinfo->free_inodes >= 0) {
		spin_lock(&sbinfo->stat_lock);
		sbinfo->free_inodes++;
		spin_unlock(&sbinfo->stat_lock);
	}
}


static struct kmem_cache *hugetlbfs_inode_cachep;

static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
{
	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
	struct hugetlbfs_inode_info *p;

	if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
		return NULL;
	p = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL);
	if (unlikely(!p)) {
		hugetlbfs_inc_free_inodes(sbinfo);
		return NULL;
	}
	return &p->vfs_inode;
}

static void hugetlbfs_free_inode(struct inode *inode)
{
	trace_hugetlbfs_free_inode(inode);
	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
}

static void hugetlbfs_destroy_inode(struct inode *inode)
{
	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
}

static const struct address_space_operations hugetlbfs_aops = {
	.write_begin		= hugetlbfs_write_begin,
	.write_end		= hugetlbfs_write_end,
	.dirty_folio		= noop_dirty_folio,
	.migrate_folio		= hugetlbfs_migrate_folio,
	.error_remove_folio	= hugetlbfs_error_remove_folio,
};


static void init_once(void *foo)
{
	struct hugetlbfs_inode_info *ei = foo;

	inode_init_once(&ei->vfs_inode);
}

static const struct file_operations hugetlbfs_file_operations = {
	.read_iter		= hugetlbfs_read_iter,
	.mmap_prepare		= hugetlbfs_file_mmap_prepare,
	.fsync			= noop_fsync,
	.get_unmapped_area	= hugetlb_get_unmapped_area,
	.llseek			= default_llseek,
	.fallocate		= hugetlbfs_fallocate,
	.fop_flags		= FOP_HUGE_PAGES,
};

static const struct inode_operations hugetlbfs_dir_inode_operations = {
	.create		= hugetlbfs_create,
	.lookup		= simple_lookup,
	.link		= simple_link,
	.unlink		= simple_unlink,
	.symlink	= hugetlbfs_symlink,
	.mkdir		= hugetlbfs_mkdir,
	.rmdir		= simple_rmdir,
	.mknod		= hugetlbfs_mknod,
	.rename		= simple_rename,
	.setattr	= hugetlbfs_setattr,
	.tmpfile	= hugetlbfs_tmpfile,
};

static const struct inode_operations hugetlbfs_inode_operations = {
	.setattr	= hugetlbfs_setattr,
};

static const struct super_operations hugetlbfs_ops = {
	.alloc_inode	= hugetlbfs_alloc_inode,
	.free_inode	= hugetlbfs_free_inode,
	.destroy_inode	= hugetlbfs_destroy_inode,
	.evict_inode	= hugetlbfs_evict_inode,
	.statfs		= hugetlbfs_statfs,
	.put_super	= hugetlbfs_put_super,
	.show_options	= hugetlbfs_show_options,
};

/*
 * Convert size option passed from command line to number of huge pages
 * in the pool specified by hstate.  Size option could be in bytes
 * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT).
 */
static long
hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
			 enum hugetlbfs_size_type val_type)
{
	if (val_type == NO_SIZE)
		return -1;

	if (val_type == SIZE_PERCENT) {
		size_opt <<= huge_page_shift(h);
		size_opt *= h->max_huge_pages;
		do_div(size_opt, 100);
	}

	size_opt >>= huge_page_shift(h);
	return size_opt;
}

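/*
 * Example: "size=50%" on a 2M hstate whose pool holds 512 pages
 * converts to ((50 << 21) * 512 / 100) >> 21 == 256 huge pages.
 */
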
| 32021982 DH |
1299 | /* |
| 1300 | * Parse one mount parameter. | |
| 1301 | */ | |
| 1302 | static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param) | |
| 1da177e4 | 1303 | { |
| 32021982 DH |
1304 | struct hugetlbfs_fs_context *ctx = fc->fs_private; |
| 1305 | struct fs_parse_result result; | |
| 79d72c68 | 1306 | struct hstate *h; |
| 32021982 DH |
1307 | char *rest; |
| 1308 | unsigned long ps; | |
| 1309 | int opt; | |
| 1310 | ||
| d7167b14 | 1311 | opt = fs_parse(fc, hugetlb_fs_parameters, param, &result); |
| 32021982 DH |
1312 | if (opt < 0) |
| 1313 | return opt; | |
| 1314 | ||
| 1315 | switch (opt) { | |
| 1316 | case Opt_uid: | |
| eefc1324 | 1317 | ctx->uid = result.uid; |
| 1da177e4 | 1318 | return 0; |
| 1da177e4 | 1319 | |
| 32021982 | 1320 | case Opt_gid: |
| eefc1324 | 1321 | ctx->gid = result.gid; |
| 32021982 | 1322 | return 0; |
| e73a75fa | 1323 | |
| 32021982 DH |
1324 | case Opt_mode: |
| 1325 | ctx->mode = result.uint_32 & 01777U; | |
| 1326 | return 0; | |
| e73a75fa | 1327 | |
| 32021982 DH |
1328 | case Opt_size: |
| 1329 | /* memparse() will accept a K/M/G without a digit */ | |
| 26215b7e | 1330 | if (!param->string || !isdigit(param->string[0])) |
| 32021982 DH |
1331 | goto bad_val; |
| 1332 | ctx->max_size_opt = memparse(param->string, &rest); | |
| 1333 | ctx->max_val_type = SIZE_STD; | |
| 1334 | if (*rest == '%') | |
| 1335 | ctx->max_val_type = SIZE_PERCENT; | |
| 1336 | return 0; | |
| e73a75fa | 1337 | |
| 32021982 DH |
1338 | case Opt_nr_inodes: |
| 1339 | /* memparse() will accept a K/M/G without a digit */ | |
| 26215b7e | 1340 | if (!param->string || !isdigit(param->string[0])) |
| 32021982 DH |
1341 | goto bad_val; |
| 1342 | ctx->nr_inodes = memparse(param->string, &rest); | |
| 1343 | return 0; | |
| e73a75fa | 1344 | |
| 32021982 DH |
1345 | case Opt_pagesize: |
| 1346 | ps = memparse(param->string, &rest); | |
| 79d72c68 OS |
1347 | h = size_to_hstate(ps); |
| 1348 | if (!h) { | |
| d0036517 | 1349 | pr_err("Unsupported page size %lu MB\n", ps / SZ_1M); |
| 32021982 | 1350 | return -EINVAL; |
| e73a75fa | 1351 | } |
| 79d72c68 | 1352 | ctx->hstate = h; |
| 32021982 | 1353 | return 0; |
| 1da177e4 | 1354 | |
| 32021982 DH |
1355 | case Opt_min_size: |
| 1356 | /* memparse() will accept a K/M/G without a digit */ | |
| 26215b7e | 1357 | if (!param->string || !isdigit(param->string[0])) |
| 32021982 DH |
1358 | goto bad_val; |
| 1359 | ctx->min_size_opt = memparse(param->string, &rest); | |
| 1360 | ctx->min_val_type = SIZE_STD; | |
| 1361 | if (*rest == '%') | |
| 1362 | ctx->min_val_type = SIZE_PERCENT; | |
| 1363 | return 0; | |
| e73a75fa | 1364 | |
| 32021982 DH |
1365 | default: |
| 1366 | return -EINVAL; | |
| 1367 | } | |
| a137e1cc | 1368 | |
| 32021982 | 1369 | bad_val: |
| b5db30cf | 1370 | return invalfc(fc, "Bad value '%s' for mount option '%s'\n", |
| 32021982 DH |
1371 | param->string, param->key); |
| 1372 | } | |
| 7ca02d0a | 1373 | |
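
For context, these parameters typically arrive via the new mount API. The sketch below (assumed mount point `/mnt/huge`, error handling omitted, `FSCONFIG_*` constants from `<linux/mount.h>` on kernels that have the API) shows how each `fsconfig(2)` string reaches `hugetlbfs_parse_param()` as one key/value pair:

```c
/* Userspace sketch: each FSCONFIG_SET_STRING call below lands in
 * hugetlbfs_parse_param(); FSCONFIG_CMD_CREATE then runs
 * hugetlbfs_get_tree(). No error handling, for brevity. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>

int main(void)
{
	int fsfd, mfd;

	fsfd = syscall(SYS_fsopen, "hugetlbfs", 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "pagesize", "2M", 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "size", "1G", 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "min_size", "50%", 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, 0, 0, 0);

	mfd = syscall(SYS_fsmount, fsfd, 0, 0);
	syscall(SYS_move_mount, mfd, "", AT_FDCWD, "/mnt/huge",
		MOVE_MOUNT_F_EMPTY_PATH);
	return 0;
}
```
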
| 32021982 DH |
1374 | /* |
| 1375 | * Validate the parsed options. | |
| 1376 | */ | |
| 1377 | static int hugetlbfs_validate(struct fs_context *fc) | |
| 1378 | { | |
| 1379 | struct hugetlbfs_fs_context *ctx = fc->fs_private; | |
| a137e1cc | 1380 | |
| 7ca02d0a MK |
1381 | /* |
| 1382 | * Use huge page pool size (in hstate) to convert the size | |
| 1383 | * options to number of huge pages. If NO_SIZE, -1 is returned. | |
| 1384 | */ | |
| 32021982 DH |
1385 | ctx->max_hpages = hugetlbfs_size_to_hpages(ctx->hstate, |
| 1386 | ctx->max_size_opt, | |
| 1387 | ctx->max_val_type); | |
| 1388 | ctx->min_hpages = hugetlbfs_size_to_hpages(ctx->hstate, | |
| 1389 | ctx->min_size_opt, | |
| 1390 | ctx->min_val_type); | |
| 7ca02d0a MK |
1391 | |
| 1392 | /* | |
| 1393 | * If max_size was specified, then min_size must not be larger | |
| 1394 | */ | |
| 32021982 DH |
1395 | if (ctx->max_val_type > NO_SIZE && |
| 1396 | ctx->min_hpages > ctx->max_hpages) { | |
| 1397 | pr_err("Minimum size can not be greater than maximum size\n"); | |
| 7ca02d0a | 1398 | return -EINVAL; |
| a137e1cc AK |
1399 | } |
| 1400 | ||
| 1da177e4 LT |
1401 | return 0; |
| 1402 | } | |
| 1403 | ||
| 1404 | static int | |
| 32021982 | 1405 | hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc) |
| 1da177e4 | 1406 | { |
| 32021982 | 1407 | struct hugetlbfs_fs_context *ctx = fc->fs_private; |
| 1da177e4 LT |
1408 | struct hugetlbfs_sb_info *sbinfo; |
| 1409 | ||
| 1da177e4 LT |
1410 | sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL); |
| 1411 | if (!sbinfo) | |
| 1412 | return -ENOMEM; | |
| 1413 | sb->s_fs_info = sbinfo; | |
| 1414 | spin_lock_init(&sbinfo->stat_lock); | |
| 32021982 DH |
1415 | sbinfo->hstate = ctx->hstate; |
| 1416 | sbinfo->max_inodes = ctx->nr_inodes; | |
| 1417 | sbinfo->free_inodes = ctx->nr_inodes; | |
| 1418 | sbinfo->spool = NULL; | |
| 1419 | sbinfo->uid = ctx->uid; | |
| 1420 | sbinfo->gid = ctx->gid; | |
| 1421 | sbinfo->mode = ctx->mode; | |
| 4a25220d | 1422 | |
| 7ca02d0a MK |
1423 | /* |
| 1424 | * Allocate and initialize subpool if maximum or minimum size is | |
| 1935ebd3 | 1425 | * specified. Any needed reservations (for minimum size) are taken |
| 445c8098 | 1426 | * when the subpool is created. |
| 7ca02d0a | 1427 | */ |
| 32021982 DH |
1428 | if (ctx->max_hpages != -1 || ctx->min_hpages != -1) { |
| 1429 | sbinfo->spool = hugepage_new_subpool(ctx->hstate, | |
| 1430 | ctx->max_hpages, | |
| 1431 | ctx->min_hpages); | |
| 90481622 DG |
1432 | if (!sbinfo->spool) |
| 1433 | goto out_free; | |
| 1434 | } | |
| 1da177e4 | 1435 | sb->s_maxbytes = MAX_LFS_FILESIZE; |
| 32021982 DH |
1436 | sb->s_blocksize = huge_page_size(ctx->hstate); |
| 1437 | sb->s_blocksize_bits = huge_page_shift(ctx->hstate); | |
| 1da177e4 LT |
1438 | sb->s_magic = HUGETLBFS_MAGIC; |
| 1439 | sb->s_op = &hugetlbfs_ops; | |
| 3333ed35 | 1440 | sb->s_d_flags = DCACHE_DONTCACHE; |
| 1da177e4 | 1441 | sb->s_time_gran = 1; |
| 15568299 MK |
1442 | |
| 1443 | /* | |
| 1444 | * Due to the special and limited functionality of hugetlbfs, it does | |
| 1445 | * not work well as a stacking filesystem. | |
| 1446 | */ | |
| 1447 | sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; | |
| 32021982 | 1448 | sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx)); |
| 48fde701 | 1449 | if (!sb->s_root) |
| 1da177e4 | 1450 | goto out_free; |
| 1da177e4 LT |
1451 | return 0; |
| 1452 | out_free: | |
| 6e6870d4 | 1453 | kfree(sbinfo->spool); |
| 1da177e4 LT |
1454 | kfree(sbinfo); |
| 1455 | return -ENOMEM; | |
| 1456 | } | |
| 1457 | ||
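
One observable consequence of the assignments above: `statfs(2)` on a hugetlbfs mount reports the huge page size as the block size (and, via `hugetlbfs_statfs()`, the subpool limits as block counts). A minimal check, assuming a mount at `/mnt/huge`:

```c
/* Sketch: f_bsize should equal the huge page size chosen at mount
 * time; the mount point path is an assumption. */
#include <stdio.h>
#include <sys/statfs.h>

int main(void)
{
	struct statfs s;

	if (statfs("/mnt/huge", &s) == 0)
		printf("bsize=%lu blocks=%lu\n",
		       (unsigned long)s.f_bsize, (unsigned long)s.f_blocks);
	return 0;
}
```
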
| 32021982 DH |
1458 | static int hugetlbfs_get_tree(struct fs_context *fc) |
| 1459 | { | |
| 1460 | int err = hugetlbfs_validate(fc); | |
| 1461 | if (err) | |
| 1462 | return err; | |
| 2ac295d4 | 1463 | return get_tree_nodev(fc, hugetlbfs_fill_super); |
| 32021982 DH |
1464 | } |
| 1465 | ||
| 1466 | static void hugetlbfs_fs_context_free(struct fs_context *fc) | |
| 1467 | { | |
| 1468 | kfree(fc->fs_private); | |
| 1469 | } | |
| 1470 | ||
| 1471 | static const struct fs_context_operations hugetlbfs_fs_context_ops = { | |
| 1472 | .free = hugetlbfs_fs_context_free, | |
| 1473 | .parse_param = hugetlbfs_parse_param, | |
| 1474 | .get_tree = hugetlbfs_get_tree, | |
| 1475 | }; | |
| 1476 | ||
| 1477 | static int hugetlbfs_init_fs_context(struct fs_context *fc) | |
| 1da177e4 | 1478 | { |
| 32021982 DH |
1479 | struct hugetlbfs_fs_context *ctx; |
| 1480 | ||
| 1481 | ctx = kzalloc(sizeof(struct hugetlbfs_fs_context), GFP_KERNEL); | |
| 1482 | if (!ctx) | |
| 1483 | return -ENOMEM; | |
| 1484 | ||
| 1485 | ctx->max_hpages = -1; /* No limit on size by default */ | |
| 1486 | ctx->nr_inodes = -1; /* No limit on number of inodes by default */ | |
| 1487 | ctx->uid = current_fsuid(); | |
| 1488 | ctx->gid = current_fsgid(); | |
| 1489 | ctx->mode = 0755; | |
| 1490 | ctx->hstate = &default_hstate; | |
| 1491 | ctx->min_hpages = -1; /* No default minimum size */ | |
| 1492 | ctx->max_val_type = NO_SIZE; | |
| 1493 | ctx->min_val_type = NO_SIZE; | |
| 1494 | fc->fs_private = ctx; | |
| 1495 | fc->ops = &hugetlbfs_fs_context_ops; | |
| 1496 | return 0; | |
| 1da177e4 LT |
1497 | } |
| 1498 | ||
| 1499 | static struct file_system_type hugetlbfs_fs_type = { | |
| 32021982 DH |
1500 | .name = "hugetlbfs", |
| 1501 | .init_fs_context = hugetlbfs_init_fs_context, | |
| d7167b14 | 1502 | .parameters = hugetlb_fs_parameters, |
| 50889f53 | 1503 | .kill_sb = kill_anon_super, |
| 91e78a1e | 1504 | .fs_flags = FS_ALLOW_IDMAP, |
| 1da177e4 LT |
1505 | }; |
| 1506 | ||
| 42d7395f | 1507 | static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; |
| 1da177e4 | 1508 | |
| ef1ff6b8 | 1509 | static int can_do_hugetlb_shm(void) |
| 1da177e4 | 1510 | { |
| a0eb3a05 EB |
1511 | kgid_t shm_group; |
| 1512 | shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group); | |
| 1513 | return capable(CAP_IPC_LOCK) || in_group_p(shm_group); | |
| 1da177e4 LT |
1514 | } |
| 1515 | ||
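
A rough userspace analogue of this gate (a sketch only; the `CAP_IPC_LOCK` half would need libcap and is omitted): read the `vm.hugetlb_shm_group` sysctl and test membership roughly the way `in_group_p()` does, against the effective and supplementary GIDs.

```c
/* Sketch: mirrors the group half of can_do_hugetlb_shm() from
 * userspace; the capability check is deliberately left out. */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/hugetlb_shm_group", "r");
	long gid = -1;

	if (f) {
		fscanf(f, "%ld", &gid);
		fclose(f);
	}

	gid_t groups[64];
	int n = getgroups(64, groups);
	int member = (gid >= 0 && (gid_t)gid == getegid());

	for (int i = 0; i < n && !member; i++)
		if (groups[i] == (gid_t)gid)
			member = 1;

	printf("in hugetlb_shm_group: %s\n", member ? "yes" : "no");
	return 0;
}
```
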
| 42d7395f AK |
1516 | static int get_hstate_idx(int page_size_log) |
| 1517 | { | |
| af73e4d9 | 1518 | struct hstate *h = hstate_sizelog(page_size_log); |
| 42d7395f | 1519 | |
| 42d7395f AK |
1520 | if (!h) |
| 1521 | return -1; | |
| 04adbc3f | 1522 | return hstate_index(h); |
| 42d7395f AK |
1523 | } |
| 1524 | ||
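
For reference, `page_size_log` is nonzero when the caller encoded log2(page size) in the upper bits of the `shmget(2)` flags. A sketch (the fallback defines mirror `<linux/shm.h>`, in case the libc headers lack them):

```c
/* Sketch: requesting the 2 MiB hstate through shmget flags; in the
 * kernel, get_hstate_idx(21) then selects that hstate. */
#include <sys/ipc.h>
#include <sys/shm.h>

#ifndef SHM_HUGETLB
#define SHM_HUGETLB	04000
#endif
#ifndef SHM_HUGE_SHIFT
#define SHM_HUGE_SHIFT	26
#endif
#define SHM_HUGE_2MB_LOG	21	/* log2(2 MiB) */

int main(void)
{
	int shmflg = SHM_HUGETLB | (SHM_HUGE_2MB_LOG << SHM_HUGE_SHIFT) |
		     IPC_CREAT | 0600;
	int id = shmget(IPC_PRIVATE, 2UL << 20, shmflg);

	if (id >= 0)
		shmctl(id, IPC_RMID, 0);	/* clean up immediately */
	return 0;
}
```
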
| af73e4d9 NH |
1525 | /* |
| 1526 | * Note that size should be aligned to the proper hugepage size by the caller, | |
| 1527 | * otherwise hugetlb_reserve_pages() reserves fewer huge pages than intended. | |
| 1528 | */ | |
| 1529 | struct file *hugetlb_file_setup(const char *name, size_t size, | |
| 83c1fd76 | 1530 | vm_flags_t acctflag, int creat_flags, |
| 1531 | int page_size_log) | |
| 1da177e4 | 1532 | { |
| 1da177e4 | 1533 | struct inode *inode; |
| e68375c8 | 1534 | struct vfsmount *mnt; |
| 42d7395f | 1535 | int hstate_idx; |
| e68375c8 | 1536 | struct file *file; |
| 42d7395f AK |
1537 | |
| 1538 | hstate_idx = get_hstate_idx(page_size_log); | |
| 1539 | if (hstate_idx < 0) | |
| 1540 | return ERR_PTR(-ENODEV); | |
| 1da177e4 | 1541 | |
| e68375c8 AV |
1542 | mnt = hugetlbfs_vfsmount[hstate_idx]; |
| 1543 | if (!mnt) | |
| 5bc98594 AM |
1544 | return ERR_PTR(-ENOENT); |
| 1545 | ||
| ef1ff6b8 | 1546 | if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { |
| 83c1fd76 | 1547 | struct ucounts *ucounts = current_ucounts(); |
| 1548 | ||
| 1549 | if (user_shm_lock(size, ucounts)) { | |
| 1550 | pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete\n", | |
| 21a3c273 | 1551 | current->comm, current->pid); |
| 83c1fd76 | 1552 | user_shm_unlock(size, ucounts); |
| 353d5c30 | 1553 | } |
| 83c1fd76 | 1554 | return ERR_PTR(-EPERM); |
| 2584e517 | 1555 | } |
| 1da177e4 | 1556 | |
| 39b65252 | 1557 | file = ERR_PTR(-ENOSPC); |
| 91e78a1e GS |
1558 | /* hugetlbfs_vfsmount[] mounts do not use idmapped mounts. */ |
| 1559 | inode = hugetlbfs_get_inode(mnt->mnt_sb, &nop_mnt_idmap, NULL, | |
| 1560 | S_IFREG | S_IRWXUGO, 0); | |
| 1da177e4 | 1561 | if (!inode) |
| e68375c8 | 1562 | goto out; |
| e1832f29 SS |
1563 | if (creat_flags == HUGETLB_SHMFS_INODE) |
| 1564 | inode->i_flags |= S_PRIVATE; | |
| 1da177e4 | 1565 | |
| 1da177e4 | 1566 | inode->i_size = size; |
| 6d6b77f1 | 1567 | clear_nlink(inode); |
| ce8d2cdf | 1568 | |
| 986f5f2b | 1569 | if (hugetlb_reserve_pages(inode, 0, |
| e68375c8 | 1570 | size >> huge_page_shift(hstate_inode(inode)), NULL, |
| 986f5f2b | 1571 | acctflag) < 0) |
| e68375c8 AV |
1572 | file = ERR_PTR(-ENOMEM); |
| 1573 | else | |
| 1574 | file = alloc_file_pseudo(inode, mnt, name, O_RDWR, | |
| 1575 | &hugetlbfs_file_operations); | |
| 1576 | if (!IS_ERR(file)) | |
| 1577 | return file; | |
| 1da177e4 | 1578 | |
| b45b5bd6 | 1579 | iput(inode); |
| e68375c8 | 1580 | out: |
| 39b65252 | 1581 | return file; |
| 1da177e4 LT |
1582 | } |
| 1583 | ||
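
To make the alignment caveat concrete: SysV shared memory is the main caller of `hugetlb_file_setup()`, and with 2 MiB huge pages a 3 MiB request converts to only one page (3M >> 21 == 1) unless rounded up first, as ipc/shm.c does before calling in. A sketch of the arithmetic, assuming 2 MiB huge pages:

```c
/* Sketch: rounding a request up to the huge page size before it is
 * turned into a reservation; 2 MiB page size is an assumption. */
#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>

#ifndef SHM_HUGETLB
#define SHM_HUGETLB	04000
#endif
#define HPAGE_SIZE	(2UL << 20)	/* assumed 2 MiB huge pages */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	size_t want = 3UL << 20;			/* 3 MiB */
	size_t size = ALIGN_UP(want, HPAGE_SIZE);	/* 4 MiB -> 2 pages */
	int id = shmget(IPC_PRIVATE, size, SHM_HUGETLB | IPC_CREAT | 0600);

	if (id < 0)
		perror("shmget");
	else
		shmctl(id, IPC_RMID, 0);
	printf("reserved %zu huge pages\n", size / HPAGE_SIZE);
	return 0;
}
```
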
| 32021982 DH |
1584 | static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h) |
| 1585 | { | |
| 1586 | struct fs_context *fc; | |
| 1587 | struct vfsmount *mnt; | |
| 1588 | ||
| 1589 | fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT); | |
| 1590 | if (IS_ERR(fc)) { | |
| 1591 | mnt = ERR_CAST(fc); | |
| 1592 | } else { | |
| 1593 | struct hugetlbfs_fs_context *ctx = fc->fs_private; | |
| 1594 | ctx->hstate = h; | |
| 24368a74 | 1595 | mnt = fc_mount_longterm(fc); |
| 32021982 DH |
1596 | put_fs_context(fc); |
| 1597 | } | |
| 1598 | if (IS_ERR(mnt)) | |
| a25fddce | 1599 | pr_err("Cannot mount internal hugetlbfs for page size %luK", |
| d0036517 | 1600 | huge_page_size(h) / SZ_1K); |
| 32021982 DH |
1601 | return mnt; |
| 1602 | } | |
| 1603 | ||
| 1da177e4 LT |
1604 | static int __init init_hugetlbfs_fs(void) |
| 1605 | { | |
| 32021982 | 1606 | struct vfsmount *mnt; |
| 42d7395f | 1607 | struct hstate *h; |
| 1da177e4 | 1608 | int error; |
| 42d7395f | 1609 | int i; |
| 1da177e4 | 1610 | |
| 457c1b27 | 1611 | if (!hugepages_supported()) { |
| 9b857d26 | 1612 | pr_info("disabling because there are no supported hugepage sizes\n"); |
| 457c1b27 NA |
1613 | return -ENOTSUPP; |
| 1614 | } | |
| 1615 | ||
| d1d5e05f | 1616 | error = -ENOMEM; |
| 1da177e4 LT |
1617 | hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", |
| 1618 | sizeof(struct hugetlbfs_inode_info), | |
| 5d097056 | 1619 | 0, SLAB_ACCOUNT, init_once); |
| 1da177e4 | 1620 | if (hugetlbfs_inode_cachep == NULL) |
| 8fc312b3 | 1621 | goto out; |
| 1da177e4 LT |
1622 | |
| 1623 | error = register_filesystem(&hugetlbfs_fs_type); | |
| 1624 | if (error) | |
| 8fc312b3 | 1625 | goto out_free; |
| 1da177e4 | 1626 | |
| 8fc312b3 | 1627 | /* default hstate mount is required */ |
| 3b2275a8 | 1628 | mnt = mount_one_hugetlbfs(&default_hstate); |
| 8fc312b3 MK |
1629 | if (IS_ERR(mnt)) { |
| 1630 | error = PTR_ERR(mnt); | |
| 1631 | goto out_unreg; | |
| 1632 | } | |
| 1633 | hugetlbfs_vfsmount[default_hstate_idx] = mnt; | |
| 1634 | ||
| 1635 | /* other hstates are optional */ | |
| 42d7395f AK |
1636 | i = 0; |
| 1637 | for_each_hstate(h) { | |
| 15f0ec94 JS |
1638 | if (i == default_hstate_idx) { |
| 1639 | i++; | |
| 8fc312b3 | 1640 | continue; |
| 15f0ec94 | 1641 | } |
| 8fc312b3 | 1642 | |
| 32021982 | 1643 | mnt = mount_one_hugetlbfs(h); |
| 8fc312b3 MK |
1644 | if (IS_ERR(mnt)) |
| 1645 | hugetlbfs_vfsmount[i] = NULL; | |
| 1646 | else | |
| 1647 | hugetlbfs_vfsmount[i] = mnt; | |
| 42d7395f AK |
1648 | i++; |
| 1649 | } | |
| 32021982 DH |
1650 | |
| 1651 | return 0; | |
| 1da177e4 | 1652 | |
| 8fc312b3 MK |
1653 | out_unreg: |
| 1654 | (void)unregister_filesystem(&hugetlbfs_fs_type); | |
| 1655 | out_free: | |
| d1d5e05f | 1656 | kmem_cache_destroy(hugetlbfs_inode_cachep); |
| 8fc312b3 | 1657 | out: |
| 1da177e4 LT |
1658 | return error; |
| 1659 | } | |
| 3e89e1c5 | 1660 | fs_initcall(init_hugetlbfs_fs) |
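
Because `init_hugetlbfs_fs()` sets up these internal per-hstate mounts at boot, huge-page-backed files can be created without any user-visible hugetlbfs mount, for example through `memfd_create(2)` (sketch; `MFD_HUGETLB` needs a reasonably recent kernel and glibc):

```c
/* Sketch: the resulting file lives on one of the internal
 * hugetlbfs_vfsmount[] mounts created above. */
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	int fd = memfd_create("demo", MFD_HUGETLB);

	if (fd < 0) {
		perror("memfd_create");
		return 1;
	}
	ftruncate(fd, 2UL << 20);	/* one 2 MiB huge page (assumed size) */
	close(fd);
	return 0;
}
```
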