// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-20 Intel Corporation. */

#include <linux/lockdep.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/shmem_fs.h>
#include <linux/suspend.h>
#include <linux/sched/mm.h>
#include <asm/sgx.h>
#include "encl.h"
#include "encls.h"
#include "sgx.h"

#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd))
/*
 * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to
 * determine the page index associated with the first PCMD entry
 * within a PCMD page.
 */
#define PCMD_FIRST_MASK GENMASK(4, 0)

/**
 * reclaimer_writing_to_pcmd() - Query if any enclave page associated with
 *                               a PCMD page is in the process of being reclaimed.
 * @encl:       Enclave to which PCMD page belongs
 * @start_addr: Address of enclave page using first entry within the PCMD page
 *
 * When an enclave page is reclaimed, some Paging Crypto MetaData (PCMD) is
 * stored. The PCMD data of a reclaimed enclave page contains enough
 * information for the processor to verify the page at the time
 * it is loaded back into the Enclave Page Cache (EPC).
 *
 * The backing storage to which enclave pages are reclaimed is laid out as
 * follows:
 * Encrypted enclave pages:SECS page:PCMD pages
 *
 * Each PCMD page contains the PCMD metadata of
 * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages.
 *
 * A PCMD page can only be truncated if it is (a) empty, and (b) not in the
 * process of getting data (and thus soon being non-empty). (b) is tested by
 * checking whether any enclave page sharing the PCMD page is in the process
 * of being reclaimed.
 *
 * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it
 * intends to reclaim that enclave page - it means that the PCMD page
 * associated with that enclave page is about to get some data and thus
 * even if the PCMD page is empty, it should not be truncated.
 *
 * Context: Enclave mutex (&sgx_encl->lock) must be held.
 * Return: 1 if the reclaimer is about to write to the PCMD page
 *         0 if the reclaimer has no intention to write to the PCMD page
 */
static int reclaimer_writing_to_pcmd(struct sgx_encl *encl,
                                     unsigned long start_addr)
{
        int reclaimed = 0;
        int i;

        /*
         * PCMD_FIRST_MASK is based on the number of PCMD entries within
         * a PCMD page being 32.
         */
        BUILD_BUG_ON(PCMDS_PER_PAGE != 32);

        for (i = 0; i < PCMDS_PER_PAGE; i++) {
                struct sgx_encl_page *entry;
                unsigned long addr;

                addr = start_addr + i * PAGE_SIZE;

                /*
                 * Stop when reaching the SECS page - it does not
                 * have a page_array entry and its reclaim is
                 * started and completed with enclave mutex held so
                 * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED
                 * flag.
                 */
                if (addr == encl->base + encl->size)
                        break;

                entry = xa_load(&encl->page_array, PFN_DOWN(addr));
                if (!entry)
                        continue;

                /*
                 * The VA page slot ID uses the same bit as the flag, so it is
                 * important to ensure that the page is not already in the
                 * backing store.
                 */
                if (entry->epc_page &&
                    (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) {
                        reclaimed = 1;
                        break;
                }
        }

        return reclaimed;
}

/*
 * Calculate byte offset of a PCMD struct associated with an enclave page. PCMDs
 * follow right after the EPC data in the backing storage. In addition to the
 * visible enclave pages, there's one extra page slot for SECS, before PCMD
 * structs.
 */
static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl,
                                                            unsigned long page_index)
{
        pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs);

        return epc_end_off + page_index * sizeof(struct sgx_pcmd);
}

/*
 * Free a page from the backing storage at the given page index.
 */
static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index)
{
        struct inode *inode = file_inode(encl->backing);

        shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
}

/*
 * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
 * Pages" in the SDM.
 */
static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
                           struct sgx_epc_page *epc_page,
                           struct sgx_epc_page *secs_page)
{
        unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
        struct sgx_encl *encl = encl_page->encl;
        pgoff_t page_index, page_pcmd_off;
        unsigned long pcmd_first_page;
        struct sgx_pageinfo pginfo;
        struct sgx_backing b;
        bool pcmd_page_empty;
        u8 *pcmd_page;
        int ret;

        if (secs_page)
                page_index = PFN_DOWN(encl_page->desc - encl_page->encl->base);
        else
                page_index = PFN_DOWN(encl->size);

        /*
         * Address of enclave page using the first entry within the PCMD page.
         */
        pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base;

        page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);

        ret = sgx_encl_lookup_backing(encl, page_index, &b);
        if (ret)
                return ret;

        pginfo.addr = encl_page->desc & PAGE_MASK;
        pginfo.contents = (unsigned long)kmap_atomic(b.contents);
        pcmd_page = kmap_atomic(b.pcmd);
        pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;

        if (secs_page)
                pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
        else
                pginfo.secs = 0;

        ret = __eldu(&pginfo, sgx_get_epc_virt_addr(epc_page),
                     sgx_get_epc_virt_addr(encl_page->va_page->epc_page) + va_offset);
        if (ret) {
                if (encls_failed(ret))
                        ENCLS_WARN(ret, "ELDU");

                ret = -EFAULT;
        }

        memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
        set_page_dirty(b.pcmd);

        /*
         * The area for the PCMD in the page was zeroed above. Check if the
         * whole page is now empty, meaning that all PCMDs have been zeroed:
         */
        pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);

        kunmap_atomic(pcmd_page);
        kunmap_atomic((void *)(unsigned long)pginfo.contents);

        get_page(b.pcmd);
        sgx_encl_put_backing(&b);

        sgx_encl_truncate_backing_page(encl, page_index);

        if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page)) {
                sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
                pcmd_page = kmap_atomic(b.pcmd);
                if (memchr_inv(pcmd_page, 0, PAGE_SIZE))
                        pr_warn("PCMD page not empty after truncate.\n");
                kunmap_atomic(pcmd_page);
        }

        put_page(b.pcmd);

        return ret;
}

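/*
 * Allocate a fresh EPC page, use ELDU to load the swapped-out enclave page
 * back into it and release the Version Array (VA) slot that guarded the
 * swapped-out copy. Returns the new EPC page, or an ERR_PTR() on failure.
 */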
static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
                                          struct sgx_epc_page *secs_page)
{
        unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
        struct sgx_encl *encl = encl_page->encl;
        struct sgx_epc_page *epc_page;
        int ret;

        epc_page = sgx_alloc_epc_page(encl_page, false);
        if (IS_ERR(epc_page))
                return epc_page;

        ret = __sgx_encl_eldu(encl_page, epc_page, secs_page);
        if (ret) {
                sgx_encl_free_epc_page(epc_page);
                return ERR_PTR(ret);
        }

        sgx_free_va_slot(encl_page->va_page, va_offset);
        list_move(&encl_page->va_page->list, &encl->va_pages);
        encl_page->desc &= ~SGX_ENCL_PAGE_VA_OFFSET_MASK;
        encl_page->epc_page = epc_page;

        return epc_page;
}

static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
                                                  struct sgx_encl_page *entry)
{
        struct sgx_epc_page *epc_page;

        /* Entry successfully located. */
        if (entry->epc_page) {
                if (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)
                        return ERR_PTR(-EBUSY);

                return entry;
        }

        if (!(encl->secs.epc_page)) {
                epc_page = sgx_encl_eldu(&encl->secs, NULL);
                if (IS_ERR(epc_page))
                        return ERR_CAST(epc_page);
        }

        epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
        if (IS_ERR(epc_page))
                return ERR_CAST(epc_page);

        encl->secs_child_cnt++;
        sgx_mark_page_reclaimable(entry->epc_page);

        return entry;
}

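/*
 * Load an enclave page for a VMA access: the VMA protection bits must be a
 * subset of the page's build-time permissions before the page is (re)loaded
 * into the EPC.
 */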
static struct sgx_encl_page *sgx_encl_load_page_in_vma(struct sgx_encl *encl,
                                                       unsigned long addr,
                                                       unsigned long vm_flags)
{
        unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
        struct sgx_encl_page *entry;

        entry = xa_load(&encl->page_array, PFN_DOWN(addr));
        if (!entry)
                return ERR_PTR(-EFAULT);

        /*
         * Verify that the page has equal or higher build time
         * permissions than the VMA permissions (i.e. the subset of {VM_READ,
         * VM_WRITE, VM_EXEC} in vma->vm_flags).
         */
        if ((entry->vm_max_prot_bits & vm_prot_bits) != vm_prot_bits)
                return ERR_PTR(-EFAULT);

        return __sgx_encl_load_page(encl, entry);
}

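/* Load an enclave page at @addr without performing a VMA permission check. */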
struct sgx_encl_page *sgx_encl_load_page(struct sgx_encl *encl,
                                         unsigned long addr)
{
        struct sgx_encl_page *entry;

        entry = xa_load(&encl->page_array, PFN_DOWN(addr));
        if (!entry)
                return ERR_PTR(-EFAULT);

        return __sgx_encl_load_page(encl, entry);
}

/**
 * sgx_encl_eaug_page() - Dynamically add page to initialized enclave
 * @vma:  VMA obtained from fault info from where page is accessed
 * @encl: enclave accessing the page
 * @addr: address that triggered the page fault
 *
 * When an initialized enclave accesses a page with no backing EPC page
 * on an SGX2 system, the EPC page can be added dynamically via the SGX2
 * ENCLS[EAUG] instruction.
 *
 * Returns: Appropriate vm_fault_t: VM_FAULT_NOPAGE when PTE was installed
 * successfully, VM_FAULT_SIGBUS or VM_FAULT_OOM as error otherwise.
 */
static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
                                     struct sgx_encl *encl, unsigned long addr)
{
        vm_fault_t vmret = VM_FAULT_SIGBUS;
        struct sgx_pageinfo pginfo = {0};
        struct sgx_encl_page *encl_page;
        struct sgx_epc_page *epc_page;
        struct sgx_va_page *va_page;
        unsigned long phys_addr;
        u64 secinfo_flags;
        int ret;

        if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags))
                return VM_FAULT_SIGBUS;

        /*
         * Ignore internal permission checking for dynamically added pages.
         * They matter only for data added during the pre-initialization
         * phase. The enclave decides the permissions by the means of
         * EACCEPT, EACCEPTCOPY and EMODPE.
         */
        secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X;
        encl_page = sgx_encl_page_alloc(encl, addr - encl->base, secinfo_flags);
        if (IS_ERR(encl_page))
                return VM_FAULT_OOM;

        mutex_lock(&encl->lock);

        epc_page = sgx_alloc_epc_page(encl_page, false);
        if (IS_ERR(epc_page)) {
                if (PTR_ERR(epc_page) == -EBUSY)
                        vmret = VM_FAULT_NOPAGE;
                goto err_out_unlock;
        }

        va_page = sgx_encl_grow(encl, false);
        if (IS_ERR(va_page)) {
                if (PTR_ERR(va_page) == -EBUSY)
                        vmret = VM_FAULT_NOPAGE;
                goto err_out_epc;
        }

        if (va_page)
                list_add(&va_page->list, &encl->va_pages);

        ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc),
                        encl_page, GFP_KERNEL);
        /*
         * If ret == -EBUSY then the page was created in another flow while
         * running without encl->lock.
         */
        if (ret)
                goto err_out_shrink;

        pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page);
        pginfo.addr = encl_page->desc & PAGE_MASK;
        pginfo.metadata = 0;

        ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page));
        if (ret)
                goto err_out;

        encl_page->encl = encl;
        encl_page->epc_page = epc_page;
        encl_page->type = SGX_PAGE_TYPE_REG;
        encl->secs_child_cnt++;

        sgx_mark_page_reclaimable(encl_page->epc_page);

        phys_addr = sgx_get_epc_phys_addr(epc_page);
        /*
         * Do not undo everything when creating PTE entry fails - next #PF
         * would find page ready for a PTE.
         */
        vmret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
        if (vmret != VM_FAULT_NOPAGE) {
                mutex_unlock(&encl->lock);
                return VM_FAULT_SIGBUS;
        }
        mutex_unlock(&encl->lock);
        return VM_FAULT_NOPAGE;

err_out:
        xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc));

err_out_shrink:
        sgx_encl_shrink(encl, va_page);
err_out_epc:
        sgx_encl_free_epc_page(epc_page);
err_out_unlock:
        mutex_unlock(&encl->lock);
        kfree(encl_page);

        return vmret;
}

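/*
 * Page fault handler for enclave VMAs: load the faulting page back into the
 * EPC (or dynamically add a brand new page via EAUG on SGX2) and install the
 * PTE for it.
 */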
static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
{
        unsigned long addr = (unsigned long)vmf->address;
        struct vm_area_struct *vma = vmf->vma;
        struct sgx_encl_page *entry;
        unsigned long phys_addr;
        struct sgx_encl *encl;
        vm_fault_t ret;

        encl = vma->vm_private_data;

        /*
         * It's very unlikely but possible that allocating memory for the
         * mm_list entry of a forked process failed in sgx_vma_open(). When
         * this happens, vm_private_data is set to NULL.
         */
        if (unlikely(!encl))
                return VM_FAULT_SIGBUS;

        /*
         * The page_array keeps track of all enclave pages, whether they
         * are swapped out or not. If there is no entry for this page and
         * the system supports SGX2 then it is possible to dynamically add
         * a new enclave page. This is only possible for an initialized
         * enclave, which is checked right away.
         */
        if (cpu_feature_enabled(X86_FEATURE_SGX2) &&
            (!xa_load(&encl->page_array, PFN_DOWN(addr))))
                return sgx_encl_eaug_page(vma, encl, addr);

        mutex_lock(&encl->lock);

        entry = sgx_encl_load_page_in_vma(encl, addr, vma->vm_flags);
        if (IS_ERR(entry)) {
                mutex_unlock(&encl->lock);

                if (PTR_ERR(entry) == -EBUSY)
                        return VM_FAULT_NOPAGE;

                return VM_FAULT_SIGBUS;
        }

        phys_addr = sgx_get_epc_phys_addr(entry->epc_page);

        ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
        if (ret != VM_FAULT_NOPAGE) {
                mutex_unlock(&encl->lock);

                return VM_FAULT_SIGBUS;
        }

        sgx_encl_test_and_clear_young(vma->vm_mm, entry);
        mutex_unlock(&encl->lock);

        return VM_FAULT_NOPAGE;
}

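/*
 * VMA open callback: add the (possibly new) mm to encl->mm_list via
 * sgx_encl_mm_add(); on failure, clear vm_private_data so that later
 * callbacks on this VMA bail out.
 */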
static void sgx_vma_open(struct vm_area_struct *vma)
{
        struct sgx_encl *encl = vma->vm_private_data;

        /*
         * It's possible but unlikely that vm_private_data is NULL. This can
         * happen in a grandchild of a process, when sgx_encl_mm_add() had
         * failed to allocate memory in this callback.
         */
        if (unlikely(!encl))
                return;

        if (sgx_encl_mm_add(encl, vma->vm_mm))
                vma->vm_private_data = NULL;
}

/**
 * sgx_encl_may_map() - Check if a requested VMA mapping is allowed
 * @encl:     an enclave pointer
 * @start:    lower bound of the address range, inclusive
 * @end:      upper bound of the address range, exclusive
 * @vm_flags: VMA flags
 *
 * Iterate through the enclave pages contained within [@start, @end) to verify
 * that the permissions requested by a subset of {VM_READ, VM_WRITE, VM_EXEC}
 * do not contain any permissions that are not contained in the build time
 * permissions of any of the enclave pages within the given address range.
 *
 * An enclave creator must declare the strongest permissions that will be
 * needed for each enclave page. This ensures that mappings have identical or
 * weaker permissions than the earlier declared permissions.
 *
 * Return: 0 on success, -EACCES otherwise
 */
int sgx_encl_may_map(struct sgx_encl *encl, unsigned long start,
                     unsigned long end, unsigned long vm_flags)
{
        unsigned long vm_prot_bits = vm_flags & (VM_READ | VM_WRITE | VM_EXEC);
        struct sgx_encl_page *page;
        unsigned long count = 0;
        int ret = 0;

        XA_STATE(xas, &encl->page_array, PFN_DOWN(start));

        /* Disallow mapping outside enclave's address range. */
        if (test_bit(SGX_ENCL_INITIALIZED, &encl->flags) &&
            (start < encl->base || end > encl->base + encl->size))
                return -EACCES;

        /*
         * Disallow READ_IMPLIES_EXEC tasks as their VMA permissions might
         * conflict with the enclave page permissions.
         */
        if (current->personality & READ_IMPLIES_EXEC)
                return -EACCES;

        mutex_lock(&encl->lock);
        xas_lock(&xas);
        xas_for_each(&xas, page, PFN_DOWN(end - 1)) {
                if (~page->vm_max_prot_bits & vm_prot_bits) {
                        ret = -EACCES;
                        break;
                }

                /* Reschedule on every XA_CHECK_SCHED iteration. */
                if (!(++count % XA_CHECK_SCHED)) {
                        xas_pause(&xas);
                        xas_unlock(&xas);
                        mutex_unlock(&encl->lock);

                        cond_resched();

                        mutex_lock(&encl->lock);
                        xas_lock(&xas);
                }
        }
        xas_unlock(&xas);
        mutex_unlock(&encl->lock);

        return ret;
}

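/* mprotect() callback: defer the permission check to sgx_encl_may_map(). */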
static int sgx_vma_mprotect(struct vm_area_struct *vma, unsigned long start,
                            unsigned long end, unsigned long newflags)
{
        return sgx_encl_may_map(vma->vm_private_data, start, end, newflags);
}

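/* Read one word of enclave memory via ENCLS[EDBGRD]; used by sgx_vma_access(). */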
static int sgx_encl_debug_read(struct sgx_encl *encl, struct sgx_encl_page *page,
                               unsigned long addr, void *data)
{
        unsigned long offset = addr & ~PAGE_MASK;
        int ret;

        ret = __edbgrd(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
        if (ret)
                return -EIO;

        return 0;
}

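/* Write one word of enclave memory via ENCLS[EDBGWR]; used by sgx_vma_access(). */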
static int sgx_encl_debug_write(struct sgx_encl *encl, struct sgx_encl_page *page,
                                unsigned long addr, void *data)
{
        unsigned long offset = addr & ~PAGE_MASK;
        int ret;

        ret = __edbgwr(sgx_get_epc_virt_addr(page->epc_page) + offset, data);
        if (ret)
                return -EIO;

        return 0;
}

/*
 * Load an enclave page to EPC if required, and take encl->lock.
 */
static struct sgx_encl_page *sgx_encl_reserve_page(struct sgx_encl *encl,
                                                   unsigned long addr,
                                                   unsigned long vm_flags)
{
        struct sgx_encl_page *entry;

        for ( ; ; ) {
                mutex_lock(&encl->lock);

                entry = sgx_encl_load_page_in_vma(encl, addr, vm_flags);
                if (PTR_ERR(entry) != -EBUSY)
                        break;

                mutex_unlock(&encl->lock);
        }

        if (IS_ERR(entry))
                mutex_unlock(&encl->lock);

        return entry;
}

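/*
 * ->access() handler used for ptrace(): copy data into or out of a debug
 * enclave one aligned word at a time via EDBGRD/EDBGWR.
 */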
static int sgx_vma_access(struct vm_area_struct *vma, unsigned long addr,
                          void *buf, int len, int write)
{
        struct sgx_encl *encl = vma->vm_private_data;
        struct sgx_encl_page *entry = NULL;
        char data[sizeof(unsigned long)];
        unsigned long align;
        int offset;
        int cnt;
        int ret = 0;
        int i;

        /*
         * If process was forked, VMA is still there but vm_private_data is set
         * to NULL.
         */
        if (!encl)
                return -EFAULT;

        if (!test_bit(SGX_ENCL_DEBUG, &encl->flags))
                return -EFAULT;

        for (i = 0; i < len; i += cnt) {
                entry = sgx_encl_reserve_page(encl, (addr + i) & PAGE_MASK,
                                              vma->vm_flags);
                if (IS_ERR(entry)) {
                        ret = PTR_ERR(entry);
                        break;
                }

                align = ALIGN_DOWN(addr + i, sizeof(unsigned long));
                offset = (addr + i) & (sizeof(unsigned long) - 1);
                cnt = sizeof(unsigned long) - offset;
                cnt = min(cnt, len - i);

                ret = sgx_encl_debug_read(encl, entry, align, data);
                if (ret)
                        goto out;

                if (write) {
                        memcpy(data + offset, buf + i, cnt);
                        ret = sgx_encl_debug_write(encl, entry, align, data);
                        if (ret)
                                goto out;
                } else {
                        memcpy(buf + i, data + offset, cnt);
                }

out:
                mutex_unlock(&encl->lock);

                if (ret)
                        break;
        }

        return ret < 0 ? ret : i;
}

const struct vm_operations_struct sgx_vm_ops = {
        .fault = sgx_vma_fault,
        .mprotect = sgx_vma_mprotect,
        .open = sgx_vma_open,
        .access = sgx_vma_access,
};

/**
 * sgx_encl_release - Destroy an enclave instance
 * @ref: address of a kref inside &sgx_encl
 *
 * Used together with kref_put(). Frees all the resources associated with the
 * enclave and the instance itself.
 */
void sgx_encl_release(struct kref *ref)
{
        struct sgx_encl *encl = container_of(ref, struct sgx_encl, refcount);
        struct sgx_va_page *va_page;
        struct sgx_encl_page *entry;
        unsigned long index;

        xa_for_each(&encl->page_array, index, entry) {
                if (entry->epc_page) {
                        /*
                         * The page and its radix tree entry cannot be freed
                         * if the page is being held by the reclaimer.
                         */
                        if (sgx_unmark_page_reclaimable(entry->epc_page))
                                continue;

                        sgx_encl_free_epc_page(entry->epc_page);
                        encl->secs_child_cnt--;
                        entry->epc_page = NULL;
                }

                kfree(entry);
                /* Invoke scheduler to prevent soft lockups. */
                cond_resched();
        }

        xa_destroy(&encl->page_array);

        if (!encl->secs_child_cnt && encl->secs.epc_page) {
                sgx_encl_free_epc_page(encl->secs.epc_page);
                encl->secs.epc_page = NULL;
        }

        while (!list_empty(&encl->va_pages)) {
                va_page = list_first_entry(&encl->va_pages, struct sgx_va_page,
                                           list);
                list_del(&va_page->list);
                sgx_encl_free_epc_page(va_page->epc_page);
                kfree(va_page);
        }

        if (encl->backing)
                fput(encl->backing);

        cleanup_srcu_struct(&encl->srcu);

        WARN_ON_ONCE(!list_empty(&encl->mm_list));

        /* Detect EPC page leaks. */
        WARN_ON_ONCE(encl->secs_child_cnt);
        WARN_ON_ONCE(encl->secs.epc_page);

        kfree(encl);
}

/*
 * 'mm' is exiting and no longer needs mmu notifications.
 */
static void sgx_mmu_notifier_release(struct mmu_notifier *mn,
                                     struct mm_struct *mm)
{
        struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
        struct sgx_encl_mm *tmp = NULL;

        /*
         * The enclave itself can remove encl_mm. Note, objects can't be moved
         * off an RCU protected list, but deletion is ok.
         */
        spin_lock(&encl_mm->encl->mm_lock);
        list_for_each_entry(tmp, &encl_mm->encl->mm_list, list) {
                if (tmp == encl_mm) {
                        list_del_rcu(&encl_mm->list);
                        break;
                }
        }
        spin_unlock(&encl_mm->encl->mm_lock);

        if (tmp == encl_mm) {
                synchronize_srcu(&encl_mm->encl->srcu);
                mmu_notifier_put(mn);
        }
}

static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
{
        struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);

        /* 'encl_mm' is going away, put encl_mm->encl reference: */
        kref_put(&encl_mm->encl->refcount, sgx_encl_release);

        kfree(encl_mm);
}

static const struct mmu_notifier_ops sgx_mmu_notifier_ops = {
        .release = sgx_mmu_notifier_release,
        .free_notifier = sgx_mmu_notifier_free,
};

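/*
 * Search encl->mm_list (under SRCU) for an existing entry matching @mm.
 * Returns the entry, or NULL if the mm has not been added to the enclave yet.
 */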
static struct sgx_encl_mm *sgx_encl_find_mm(struct sgx_encl *encl,
                                            struct mm_struct *mm)
{
        struct sgx_encl_mm *encl_mm = NULL;
        struct sgx_encl_mm *tmp;
        int idx;

        idx = srcu_read_lock(&encl->srcu);

        list_for_each_entry_rcu(tmp, &encl->mm_list, list) {
                if (tmp->mm == mm) {
                        encl_mm = tmp;
                        break;
                }
        }

        srcu_read_unlock(&encl->srcu, idx);

        return encl_mm;
}

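/*
 * Associate @mm with @encl: take an enclave reference, register an MMU
 * notifier for the mm and add it to encl->mm_list. Safe to call repeatedly
 * for the same mm; an existing entry short-circuits the call.
 */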
int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
{
        struct sgx_encl_mm *encl_mm;
        int ret;

        /*
         * Even though a single enclave may be mapped into an mm more than once,
         * each 'mm' only appears once on encl->mm_list. This is guaranteed by
         * holding the mm's mmap lock for write before an mm can be added to or
         * removed from an encl->mm_list.
         */
        mmap_assert_write_locked(mm);

        /*
         * It's possible that an entry already exists in the mm_list, because it
         * is removed only on VFS release or process exit.
         */
        if (sgx_encl_find_mm(encl, mm))
                return 0;

        encl_mm = kzalloc(sizeof(*encl_mm), GFP_KERNEL);
        if (!encl_mm)
                return -ENOMEM;

        /* Grab a refcount for the encl_mm->encl reference: */
        kref_get(&encl->refcount);
        encl_mm->encl = encl;
        encl_mm->mm = mm;
        encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;

        ret = __mmu_notifier_register(&encl_mm->mmu_notifier, mm);
        if (ret) {
                kfree(encl_mm);
                return ret;
        }

        spin_lock(&encl->mm_lock);
        list_add_rcu(&encl_mm->list, &encl->mm_list);
        /* Pairs with smp_rmb() in sgx_zap_enclave_ptes(). */
        smp_wmb();
        encl->mm_list_version++;
        spin_unlock(&encl->mm_lock);

        return 0;
}

/**
 * sgx_encl_cpumask() - Query which CPUs might be accessing the enclave
 * @encl: the enclave
 *
 * Some SGX functions require that no cached linear-to-physical address
 * mappings are present before they can succeed. For example, ENCLS[EWB]
 * copies a page from the enclave page cache to regular main memory but
 * it fails if it cannot ensure that there are no cached
 * linear-to-physical address mappings referring to the page.
 *
 * SGX hardware flushes all cached linear-to-physical mappings on a CPU
 * when an enclave is exited via ENCLU[EEXIT] or an Asynchronous Enclave
 * Exit (AEX). Exiting an enclave will thus ensure cached linear-to-physical
 * address mappings are cleared but coordination with the tracking done within
 * the SGX hardware is needed to support the SGX functions that depend on this
 * cache clearing.
 *
 * When the ENCLS[ETRACK] function is issued on an enclave the hardware
 * tracks threads operating inside the enclave at that time. The SGX
 * hardware tracking requires that all the identified threads must have
 * exited the enclave in order to flush the mappings before a function such
 * as ENCLS[EWB] will be permitted.
 *
 * The following flow is used to support SGX functions that require that
 * no cached linear-to-physical address mappings are present:
 * 1) Execute ENCLS[ETRACK] to initiate hardware tracking.
 * 2) Use this function (sgx_encl_cpumask()) to query which CPUs might be
 *    accessing the enclave.
 * 3) Send IPI to identified CPUs, kicking them out of the enclave and
 *    thus flushing all locally cached linear-to-physical address mappings.
 * 4) Execute SGX function.
 *
 * Context: It is required to call this function after ENCLS[ETRACK].
 *          This will ensure that if any new mm appears (racing with
 *          sgx_encl_mm_add()) then the new mm will enter into the
 *          enclave with fresh linear-to-physical address mappings.
 *
 *          It is required that all IPIs are completed before a new
 *          ENCLS[ETRACK] is issued so be sure to protect steps 1 to 3
 *          of the above flow with the enclave's mutex.
 *
 * Return: cpumask of CPUs that might be accessing @encl
 */
const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl)
{
        cpumask_t *cpumask = &encl->cpumask;
        struct sgx_encl_mm *encl_mm;
        int idx;

        cpumask_clear(cpumask);

        idx = srcu_read_lock(&encl->srcu);

        list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
                if (!mmget_not_zero(encl_mm->mm))
                        continue;

                cpumask_or(cpumask, cpumask, mm_cpumask(encl_mm->mm));

                mmput_async(encl_mm->mm);
        }

        srcu_read_unlock(&encl->srcu, idx);

        return cpumask;
}

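/* Read (and pin) a single page of the enclave's shmem backing storage. */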
static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
                                              pgoff_t index)
{
        struct inode *inode = encl->backing->f_path.dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
        gfp_t gfpmask = mapping_gfp_mask(mapping);

        return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
}

/**
 * sgx_encl_get_backing() - Pin the backing storage
 * @encl:       an enclave pointer
 * @page_index: enclave page index
 * @backing:    data for accessing backing storage for the page
 *
 * Pin the backing storage pages for storing the encrypted contents and Paging
 * Crypto MetaData (PCMD) of an enclave page.
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
                                struct sgx_backing *backing)
{
        pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
        struct page *contents;
        struct page *pcmd;

        contents = sgx_encl_get_backing_page(encl, page_index);
        if (IS_ERR(contents))
                return PTR_ERR(contents);

        pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off));
        if (IS_ERR(pcmd)) {
                put_page(contents);
                return PTR_ERR(pcmd);
        }

        backing->contents = contents;
        backing->pcmd = pcmd;
        backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);

        return 0;
}

/*
 * When called from ksgxd, returns the mem_cgroup of a struct mm stored
 * in the enclave's mm_list. When not called from ksgxd, just returns
 * the mem_cgroup of the current task.
 */
static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
{
        struct mem_cgroup *memcg = NULL;
        struct sgx_encl_mm *encl_mm;
        int idx;

        /*
         * If called from normal task context, return the mem_cgroup
         * of the current task's mm. The remainder of the handling is for
         * ksgxd.
         */
        if (!current_is_ksgxd())
                return get_mem_cgroup_from_mm(current->mm);

        /*
         * Search the enclave's mm_list to find an mm associated with
         * this enclave to charge the allocation to.
         */
        idx = srcu_read_lock(&encl->srcu);

        list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
                if (!mmget_not_zero(encl_mm->mm))
                        continue;

                memcg = get_mem_cgroup_from_mm(encl_mm->mm);

                mmput_async(encl_mm->mm);

                break;
        }

        srcu_read_unlock(&encl->srcu, idx);

        /*
         * In the rare case that there isn't an mm associated with
         * the enclave, set memcg to the current active mem_cgroup.
         * This will be the root mem_cgroup if there is no active
         * mem_cgroup.
         */
        if (!memcg)
                return get_mem_cgroup_from_mm(NULL);

        return memcg;
}

/**
 * sgx_encl_alloc_backing() - allocate a new backing storage page
 * @encl:       an enclave pointer
 * @page_index: enclave page index
 * @backing:    data for accessing backing storage for the page
 *
 * When called from ksgxd, sets the active memcg from one of the
 * mms in the enclave's mm_list prior to any backing page allocation,
 * in order to ensure that shmem page allocations are charged to the
 * enclave.
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
                           struct sgx_backing *backing)
{
        struct mem_cgroup *encl_memcg = sgx_encl_get_mem_cgroup(encl);
        struct mem_cgroup *memcg = set_active_memcg(encl_memcg);
        int ret;

        ret = sgx_encl_get_backing(encl, page_index, backing);

        set_active_memcg(memcg);
        mem_cgroup_put(encl_memcg);

        return ret;
}

/**
 * sgx_encl_lookup_backing() - retrieve an existing backing storage page
 * @encl:       an enclave pointer
 * @page_index: enclave page index
 * @backing:    data for accessing backing storage for the page
 *
 * Retrieve a backing page for loading data back into an EPC page with ELDU.
 * It is the caller's responsibility to ensure that it is appropriate to use
 * sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is
 * not used correctly, this will cause an allocation which is not accounted for.
 *
 * Return:
 *   0 on success,
 *   -errno otherwise.
 */
int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
                            struct sgx_backing *backing)
{
        return sgx_encl_get_backing(encl, page_index, backing);
}

/**
 * sgx_encl_put_backing() - Unpin the backing storage
 * @backing: data for accessing backing storage for the page
 */
void sgx_encl_put_backing(struct sgx_backing *backing)
{
        put_page(backing->pcmd);
        put_page(backing->contents);
}

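/* apply_to_page_range() callback: test and clear the Accessed bit of one PTE. */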
static int sgx_encl_test_and_clear_young_cb(pte_t *ptep, unsigned long addr,
                                            void *data)
{
        pte_t pte;
        int ret;

        ret = pte_young(*ptep);
        if (ret) {
                pte = pte_mkold(*ptep);
                set_pte_at((struct mm_struct *)data, addr, ptep, pte);
        }

        return ret;
}

/**
 * sgx_encl_test_and_clear_young() - Test and reset the accessed bit
 * @mm:   mm_struct that is checked
 * @page: enclave page to be tested for recent access
 *
 * Checks the Access (A) bit from the PTE corresponding to the enclave page and
 * clears it.
 *
 * Return: 1 if the page has been recently accessed and 0 if not.
 */
int sgx_encl_test_and_clear_young(struct mm_struct *mm,
                                  struct sgx_encl_page *page)
{
        unsigned long addr = page->desc & PAGE_MASK;
        struct sgx_encl *encl = page->encl;
        struct vm_area_struct *vma;
        int ret;

        ret = sgx_encl_find(mm, addr, &vma);
        if (ret)
                return 0;

        if (encl != vma->vm_private_data)
                return 0;

        ret = apply_to_page_range(vma->vm_mm, addr, PAGE_SIZE,
                                  sgx_encl_test_and_clear_young_cb, vma->vm_mm);
        if (ret < 0)
                return 0;

        return ret;
}

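/*
 * Allocate a struct sgx_encl_page for the given enclave offset and derive the
 * maximum allowed VM protection bits from the SECINFO permissions.
 */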
struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl,
                                          unsigned long offset,
                                          u64 secinfo_flags)
{
        struct sgx_encl_page *encl_page;
        unsigned long prot;

        encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL);
        if (!encl_page)
                return ERR_PTR(-ENOMEM);

        encl_page->desc = encl->base + offset;
        encl_page->encl = encl;

        prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ)  |
               _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) |
               _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC);

        /*
         * TCS pages must always have RW set for CPU access while the SECINFO
         * permissions are *always* zero - the CPU ignores the user provided
         * values and silently overwrites them with zero permissions.
         */
        if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS)
                prot |= PROT_READ | PROT_WRITE;

        /* Calculate maximum of the VM flags for the page. */
        encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0);

        return encl_page;
}

/**
 * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave
 * @encl: the enclave
 * @addr: page aligned pointer to single page for which PTEs will be removed
 *
 * Multiple VMAs may have an enclave page mapped. Remove the PTE mapping
 * @addr from each VMA. Ensure that the page fault handler is ready to handle
 * new mappings of @addr before calling this function.
 */
void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr)
{
        unsigned long mm_list_version;
        struct sgx_encl_mm *encl_mm;
        struct vm_area_struct *vma;
        int idx, ret;

        do {
                mm_list_version = encl->mm_list_version;

                /* Pairs with smp_wmb() in sgx_encl_mm_add(). */
                smp_rmb();

                idx = srcu_read_lock(&encl->srcu);

                list_for_each_entry_rcu(encl_mm, &encl->mm_list, list) {
                        if (!mmget_not_zero(encl_mm->mm))
                                continue;

                        mmap_read_lock(encl_mm->mm);

                        ret = sgx_encl_find(encl_mm->mm, addr, &vma);
                        if (!ret && encl == vma->vm_private_data)
                                zap_vma_ptes(vma, addr, PAGE_SIZE);

                        mmap_read_unlock(encl_mm->mm);

                        mmput_async(encl_mm->mm);
                }

                srcu_read_unlock(&encl->srcu, idx);
        } while (unlikely(encl->mm_list_version != mm_list_version));
}

/**
 * sgx_alloc_va_page() - Allocate a Version Array (VA) page
 * @reclaim: Reclaim EPC pages directly if none available. Enclave
 *           mutex should not be held if this is set.
 *
 * Allocate a free EPC page and convert it to a Version Array (VA) page.
 *
 * Return:
 *   a VA page,
 *   -errno otherwise
 */
struct sgx_epc_page *sgx_alloc_va_page(bool reclaim)
{
        struct sgx_epc_page *epc_page;
        int ret;

        epc_page = sgx_alloc_epc_page(NULL, reclaim);
        if (IS_ERR(epc_page))
                return ERR_CAST(epc_page);

        ret = __epa(sgx_get_epc_virt_addr(epc_page));
        if (ret) {
                WARN_ONCE(1, "EPA returned %d (0x%x)", ret, ret);
                sgx_encl_free_epc_page(epc_page);
                return ERR_PTR(-EFAULT);
        }

        return epc_page;
}

/**
 * sgx_alloc_va_slot - allocate a VA slot
 * @va_page: a &struct sgx_va_page instance
 *
 * Allocates a slot from a &struct sgx_va_page instance.
 *
 * Return: offset of the slot inside the VA page
 */
unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page)
{
        int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);

        if (slot < SGX_VA_SLOT_COUNT)
                set_bit(slot, va_page->slots);

        return slot << 3;
}

/**
 * sgx_free_va_slot - free a VA slot
 * @va_page: a &struct sgx_va_page instance
 * @offset:  offset of the slot inside the VA page
 *
 * Frees a slot from a &struct sgx_va_page instance.
 */
void sgx_free_va_slot(struct sgx_va_page *va_page, unsigned int offset)
{
        clear_bit(offset >> 3, va_page->slots);
}

/**
 * sgx_va_page_full - is the VA page full?
 * @va_page: a &struct sgx_va_page instance
 *
 * Return: true if all slots have been taken
 */
bool sgx_va_page_full(struct sgx_va_page *va_page)
{
        int slot = find_first_zero_bit(va_page->slots, SGX_VA_SLOT_COUNT);

        return slot == SGX_VA_SLOT_COUNT;
}

/**
 * sgx_encl_free_epc_page - free an EPC page assigned to an enclave
 * @page: EPC page to be freed
 *
 * Free an EPC page assigned to an enclave. It does EREMOVE for the page, and
 * only upon success puts the page back on the free page list. Otherwise, it
 * gives a WARNING to indicate that the page is leaked.
 */
void sgx_encl_free_epc_page(struct sgx_epc_page *page)
{
        int ret;

        WARN_ON_ONCE(page->flags & SGX_EPC_PAGE_RECLAIMER_TRACKED);

        ret = __eremove(sgx_get_epc_virt_addr(page));
        if (WARN_ONCE(ret, EREMOVE_ERROR_MESSAGE, ret, ret))
                return;

        sgx_free_epc_page(page);
}