git.ipfire.org Git - thirdparty/gcc.git/commitdiff
'-foffload-memory=pinned' using offloading device interfaces
author: Thomas Schwinge <thomas@codesourcery.com>
Thu, 30 Mar 2023 08:08:12 +0000 (10:08 +0200)
committer: Thomas Schwinge <thomas@codesourcery.com>
Mon, 3 Apr 2023 14:43:02 +0000 (16:43 +0200)
Implemented for nvptx offloading via 'cuMemHostAlloc', 'cuMemHostRegister'.

gcc/
* doc/invoke.texi (-foffload-memory=pinned): Document.
include/
* cuda/cuda.h (CUresult): Add
'CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED'.
(CUdevice_attribute): Add
'CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED'.
(CU_MEMHOSTREGISTER_READ_ONLY): Add.
(cuMemHostGetFlags, cuMemHostRegister, cuMemHostUnregister): Add.
libgomp/
* libgomp-plugin.h (GOMP_OFFLOAD_page_locked_host_free): Add
'struct goacc_asyncqueue *' formal parameter.
(GOMP_OFFLOAD_page_locked_host_register)
(GOMP_OFFLOAD_page_locked_host_unregister)
(GOMP_OFFLOAD_page_locked_host_p): Add.
* libgomp.h (always_pinned_mode)
(gomp_page_locked_host_register_dev)
(gomp_page_locked_host_unregister_dev): Add.
(struct splay_tree_key_s): Add 'page_locked_host_p'.
(struct gomp_device_descr): Add
'GOMP_OFFLOAD_page_locked_host_register',
'GOMP_OFFLOAD_page_locked_host_unregister',
'GOMP_OFFLOAD_page_locked_host_p'.
* libgomp.texi (-foffload-memory=pinned): Document.
* plugin/cuda-lib.def (cuMemHostGetFlags, cuMemHostRegister_v2)
(cuMemHostRegister, cuMemHostUnregister): Add.
* plugin/plugin-nvptx.c (struct ptx_device): Add
'read_only_host_register_supported'.
(nvptx_open_device): Initialize it.
(free_host_blocks, free_host_blocks_lock)
(nvptx_run_deferred_page_locked_host_free)
(nvptx_page_locked_host_free_callback, nvptx_page_locked_host_p)
(GOMP_OFFLOAD_page_locked_host_register)
(nvptx_page_locked_host_unregister_callback)
(GOMP_OFFLOAD_page_locked_host_unregister)
(GOMP_OFFLOAD_page_locked_host_p)
(nvptx_run_deferred_page_locked_host_unregister)
(nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback):
Add.
(GOMP_OFFLOAD_fini_device, GOMP_OFFLOAD_page_locked_host_alloc)
(GOMP_OFFLOAD_run): Call
'nvptx_run_deferred_page_locked_host_free'.
(struct goacc_asyncqueue): Add
'page_locked_host_unregister_blocks_lock',
'page_locked_host_unregister_blocks'.
(nvptx_goacc_asyncqueue_construct)
(nvptx_goacc_asyncqueue_destruct): Handle those.
(GOMP_OFFLOAD_page_locked_host_free): Handle
'struct goacc_asyncqueue *' formal parameter.
(GOMP_OFFLOAD_openacc_async_test)
(nvptx_goacc_asyncqueue_synchronize): Call
'nvptx_run_deferred_page_locked_host_unregister'.
(GOMP_OFFLOAD_openacc_async_serialize): Call
'nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback'.
* config/linux/allocator.c (linux_memspace_alloc)
(linux_memspace_calloc, linux_memspace_free)
(linux_memspace_realloc): Remove 'always_pinned_mode' handling.
(GOMP_enable_pinned_mode): Move...
* target.c: ... here.
(always_pinned_mode, verify_always_pinned_mode)
(gomp_verify_always_pinned_mode, gomp_page_locked_host_alloc_dev)
(gomp_page_locked_host_free_dev)
(gomp_page_locked_host_aligned_alloc_dev)
(gomp_page_locked_host_aligned_free_dev)
(gomp_page_locked_host_register_dev)
(gomp_page_locked_host_unregister_dev): Add.
(gomp_copy_host2dev, gomp_map_vars_internal)
(gomp_remove_var_internal, gomp_unmap_vars_internal)
(get_gomp_offload_icvs, gomp_load_image_to_device)
(gomp_target_rev, omp_target_memcpy_copy)
(omp_target_memcpy_rect_worker): Handle 'always_pinned_mode'.
(gomp_copy_host2dev, gomp_copy_dev2host): Handle
'verify_always_pinned_mode'.
(GOMP_target_ext): Add 'assert'.
(gomp_page_locked_host_alloc): Use
'gomp_page_locked_host_alloc_dev'.
(gomp_page_locked_host_free): Use
'gomp_page_locked_host_free_dev'.
(omp_target_associate_ptr): Adjust.
(gomp_load_plugin_for_device): Handle 'page_locked_host_register',
'page_locked_host_unregister', 'page_locked_host_p'.
* oacc-mem.c (memcpy_tofrom_device): Handle 'always_pinned_mode'.
* libgomp_g.h (GOMP_enable_pinned_mode): Adjust.
* testsuite/libgomp.c/alloc-pinned-7.c: Remove.

15 files changed:
gcc/ChangeLog.omp
gcc/doc/invoke.texi
include/ChangeLog.omp
include/cuda/cuda.h
libgomp/ChangeLog.omp
libgomp/config/linux/allocator.c
libgomp/libgomp-plugin.h
libgomp/libgomp.h
libgomp/libgomp.texi
libgomp/libgomp_g.h
libgomp/oacc-mem.c
libgomp/plugin/cuda-lib.def
libgomp/plugin/plugin-nvptx.c
libgomp/target.c
libgomp/testsuite/libgomp.c/alloc-pinned-7.c [deleted file]

index 5e76158db068a8ba609c09d9774f8637e31e3501..d8aa0ab51bf55763dde38d7812015f9d8cade866 100644 (file)
@@ -1,3 +1,7 @@
+2023-04-03  Thomas Schwinge  <thomas@codesourcery.com>
+
+       * doc/invoke.texi (-foffload-memory=pinned): Document.
+
 2023-03-31  Frederik Harwath  <frederik@codesourcery.com>
 
        * omp-transform-loops.cc (walk_omp_for_loops): Handle
index 1fe047042ae0a035cd3ee633c644fc50cdc65054..070b63030f89ae72e659235d0f978be9dd3517fd 100644 (file)
@@ -2711,13 +2711,28 @@ Typical command lines are
 @itemx -foffload-memory=unified
 @itemx -foffload-memory=pinned
 @opindex foffload-memory
+@cindex Offloading memory modes
 @cindex OpenMP offloading memory modes
+
 Enable a memory optimization mode to use with OpenMP.  The default behavior,
 @option{-foffload-memory=none}, is to do nothing special (unless enabled via
 a requires directive in the code).  @option{-foffload-memory=unified} is
 equivalent to @code{#pragma omp requires unified_shared_memory}.
-@option{-foffload-memory=pinned} forces all host memory to be pinned (this
-mode may require the user to increase the ulimit setting for locked memory).
+
+@c The following paragraph is duplicated in
+@c '../../libgomp/libgomp.texi', '-foffload-memory=pinned'.
+If supported by the active offloading device,
+@option{-foffload-memory=pinned} enables automatic use of page-locked
+host memory for memory objects participating in host <-> device memory
+transfers, for both OpenACC and OpenMP offloading.
+Such memory is allocated or registered using the respective offloading
+device interfaces, which potentially helps optimization of host <->
+device data transfers.
+This option is experimental.
+Beware that use of a lot of pinned memory may degrade overall system
+performance, as it does reduce the amount of host memory available for
+paging.
+
 All translation units must select the same setting to avoid undefined
 behavior.
 
index 244d67e660824fc18b392e8671930f66714724bc..655377a6d0dc1ea2728fe5636cbd9b21d32ff8c6 100644 (file)
@@ -1,3 +1,12 @@
+2023-04-03  Thomas Schwinge  <thomas@codesourcery.com>
+
+       * cuda/cuda.h (CUresult): Add
+       'CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED'.
+       (CUdevice_attribute): Add
+       'CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED'.
+       (CU_MEMHOSTREGISTER_READ_ONLY): Add.
+       (cuMemHostGetFlags, cuMemHostRegister, cuMemHostUnregister): Add.
+
 2023-02-20  Thomas Schwinge  <thomas@codesourcery.com>
 
        * cuda/cuda.h (cuMemHostRegister, cuMemHostUnregister): Remove.
index 062d394b95f28c28fc25cd523ee729906962fed5..f8f464607dbf787eae3353ec6cfbba5078ed54ca 100644 (file)
@@ -57,6 +57,7 @@ typedef enum {
   CUDA_ERROR_INVALID_CONTEXT = 201,
   CUDA_ERROR_NOT_FOUND = 500,
   CUDA_ERROR_NOT_READY = 600,
+  CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712,
   CUDA_ERROR_LAUNCH_FAILED = 719,
   CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720,
   CUDA_ERROR_NOT_PERMITTED = 800,
@@ -80,7 +81,8 @@ typedef enum {
   CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,
   CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,
   CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,
-  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82
+  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,
+  CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113
 } CUdevice_attribute;
 
 typedef enum {
@@ -124,8 +126,11 @@ enum {
 #define CU_LAUNCH_PARAM_END ((void *) 0)
 #define CU_LAUNCH_PARAM_BUFFER_POINTER ((void *) 1)
 #define CU_LAUNCH_PARAM_BUFFER_SIZE ((void *) 2)
+
 #define CU_MEMHOSTALLOC_DEVICEMAP 0x02U
 
+#define CU_MEMHOSTREGISTER_READ_ONLY 0x08
+
 enum {
   CU_STREAM_DEFAULT = 0,
   CU_STREAM_NON_BLOCKING = 1
@@ -183,6 +188,10 @@ CUresult cuMemAlloc (CUdeviceptr *, size_t);
 CUresult cuMemAllocHost (void **, size_t);
 CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int);
 CUresult cuMemHostAlloc (void **, size_t, unsigned int);
+CUresult cuMemHostGetFlags (unsigned int *, void *);
+#define cuMemHostRegister cuMemHostRegister_v2
+CUresult cuMemHostRegister(void *, size_t, unsigned int);
+CUresult cuMemHostUnregister(void *);
 CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t);
 #define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
 CUresult cuMemcpyDtoDAsync (CUdeviceptr, CUdeviceptr, size_t, CUstream);
index 7afb5f43c042a8a2567e8afd53037fb1b3f95def..1b02c057562feac688b4fb0068b00132201f0830 100644 (file)
@@ -1,5 +1,80 @@
 2023-04-03  Thomas Schwinge  <thomas@codesourcery.com>
 
+       * libgomp-plugin.h (GOMP_OFFLOAD_page_locked_host_free): Add
+       'struct goacc_asyncqueue *' formal parameter.
+       (GOMP_OFFLOAD_page_locked_host_register)
+       (GOMP_OFFLOAD_page_locked_host_unregister)
+       (GOMP_OFFLOAD_page_locked_host_p): Add.
+       * libgomp.h (always_pinned_mode)
+       (gomp_page_locked_host_register_dev)
+       (gomp_page_locked_host_unregister_dev): Add.
+       (struct splay_tree_key_s): Add 'page_locked_host_p'.
+       (struct gomp_device_descr): Add
+       'GOMP_OFFLOAD_page_locked_host_register',
+       'GOMP_OFFLOAD_page_locked_host_unregister',
+       'GOMP_OFFLOAD_page_locked_host_p'.
+       * libgomp.texi (-foffload-memory=pinned): Document.
+       * plugin/cuda-lib.def (cuMemHostGetFlags, cuMemHostRegister_v2)
+       (cuMemHostRegister, cuMemHostUnregister): Add.
+       * plugin/plugin-nvptx.c (struct ptx_device): Add
+       'read_only_host_register_supported'.
+       (nvptx_open_device): Initialize it.
+       (free_host_blocks, free_host_blocks_lock)
+       (nvptx_run_deferred_page_locked_host_free)
+       (nvptx_page_locked_host_free_callback, nvptx_page_locked_host_p)
+       (GOMP_OFFLOAD_page_locked_host_register)
+       (nvptx_page_locked_host_unregister_callback)
+       (GOMP_OFFLOAD_page_locked_host_unregister)
+       (GOMP_OFFLOAD_page_locked_host_p)
+       (nvptx_run_deferred_page_locked_host_unregister)
+       (nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback):
+       Add.
+       (GOMP_OFFLOAD_fini_device, GOMP_OFFLOAD_page_locked_host_alloc)
+       (GOMP_OFFLOAD_run): Call
+       'nvptx_run_deferred_page_locked_host_free'.
+       (struct goacc_asyncqueue): Add
+       'page_locked_host_unregister_blocks_lock',
+       'page_locked_host_unregister_blocks'.
+       (nvptx_goacc_asyncqueue_construct)
+       (nvptx_goacc_asyncqueue_destruct): Handle those.
+       (GOMP_OFFLOAD_page_locked_host_free): Handle
+       'struct goacc_asyncqueue *' formal parameter.
+       (GOMP_OFFLOAD_openacc_async_test)
+       (nvptx_goacc_asyncqueue_synchronize): Call
+       'nvptx_run_deferred_page_locked_host_unregister'.
+       (GOMP_OFFLOAD_openacc_async_serialize): Call
+       'nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback'.
+       * config/linux/allocator.c (linux_memspace_alloc)
+       (linux_memspace_calloc, linux_memspace_free)
+       (linux_memspace_realloc): Remove 'always_pinned_mode' handling.
+       (GOMP_enable_pinned_mode): Move...
+       * target.c: ... here.
+       (always_pinned_mode, verify_always_pinned_mode)
+       (gomp_verify_always_pinned_mode, gomp_page_locked_host_alloc_dev)
+       (gomp_page_locked_host_free_dev)
+       (gomp_page_locked_host_aligned_alloc_dev)
+       (gomp_page_locked_host_aligned_free_dev)
+       (gomp_page_locked_host_register_dev)
+       (gomp_page_locked_host_unregister_dev): Add.
+       (gomp_copy_host2dev, gomp_map_vars_internal)
+       (gomp_remove_var_internal, gomp_unmap_vars_internal)
+       (get_gomp_offload_icvs, gomp_load_image_to_device)
+       (gomp_target_rev, omp_target_memcpy_copy)
+       (omp_target_memcpy_rect_worker): Handle 'always_pinned_mode'.
+       (gomp_copy_host2dev, gomp_copy_dev2host): Handle
+       'verify_always_pinned_mode'.
+       (GOMP_target_ext): Add 'assert'.
+       (gomp_page_locked_host_alloc): Use
+       'gomp_page_locked_host_alloc_dev'.
+       (gomp_page_locked_host_free): Use
+       'gomp_page_locked_host_free_dev'.
+       (omp_target_associate_ptr): Adjust.
+       (gomp_load_plugin_for_device): Handle 'page_locked_host_register',
+       'page_locked_host_unregister', 'page_locked_host_p'.
+       * oacc-mem.c (memcpy_tofrom_device): Handle 'always_pinned_mode'.
+       * libgomp_g.h (GOMP_enable_pinned_mode): Adjust.
+       * testsuite/libgomp.c/alloc-pinned-7.c: Remove.
+
        PR other/76739
        * target.c (gomp_map_vars_internal): Pass pre-allocated 'ptrblock'
        to 'goacc_noncontig_array_create_ptrblock'.
index 3e1bd5a128545a90fb6a28d57ead2342f4e1f875..62649f64221fd8298be57e6ce691a258b0c97705 100644 (file)
 #include <assert.h>
 #include "libgomp.h"
 
-static bool always_pinned_mode = false;
-
-/* This function is called by the compiler when -foffload-memory=pinned
-   is used.  */
-
-void
-GOMP_enable_pinned_mode ()
-{
-  if (mlockall (MCL_CURRENT | MCL_FUTURE) != 0)
-    gomp_error ("failed to pin all memory (ulimit too low?)");
-  else
-    always_pinned_mode = true;
-}
-
 static int using_device_for_page_locked
   = /* uninitialized */ -1;
 
@@ -70,9 +56,6 @@ linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin,
              __FUNCTION__, (unsigned long long) memspace,
              (unsigned long long) size, pin, init0);
 
-  /* Explicit pinning may not be required.  */
-  pin = pin && !always_pinned_mode;
-
   void *addr;
 
   if (memspace == ompx_unified_shared_mem_space)
@@ -137,9 +120,6 @@ linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
   gomp_debug (0, "%s: memspace=%llu, size=%llu, pin=%d\n",
              __FUNCTION__, (unsigned long long) memspace, (unsigned long long) size, pin);
 
-  /* Explicit pinning may not be required.  */
-  pin = pin && !always_pinned_mode;
-
   if (memspace == ompx_unified_shared_mem_space)
     {
       void *ret = gomp_usm_alloc (size, GOMP_DEVICE_ICV);
@@ -159,9 +139,6 @@ linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
   gomp_debug (0, "%s: memspace=%llu, addr=%p, size=%llu, pin=%d\n",
              __FUNCTION__, (unsigned long long) memspace, addr, (unsigned long long) size, pin);
 
-  /* Explicit pinning may not be required.  */
-  pin = pin && !always_pinned_mode;
-
   if (memspace == ompx_unified_shared_mem_space)
     gomp_usm_free (addr, GOMP_DEVICE_ICV);
   else if (pin)
@@ -188,9 +165,6 @@ linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
   gomp_debug (0, "%s: memspace=%llu, addr=%p, oldsize=%llu, size=%llu, oldpin=%d, pin=%d\n",
              __FUNCTION__, (unsigned long long) memspace, addr, (unsigned long long) oldsize, (unsigned long long) size, oldpin, pin);
 
-  /* Explicit pinning may not be required.  */
-  pin = pin && !always_pinned_mode;
-
   if (memspace == ompx_unified_shared_mem_space)
     goto manual_realloc;
   else if (oldpin && pin)
index ca557a79380e50704d4950d1a9c7a485cdf5df1c..7456b7d1026ab873d3da47e03777b9ce5d7f7991 100644 (file)
@@ -141,7 +141,12 @@ extern void *GOMP_OFFLOAD_usm_alloc (int, size_t);
 extern bool GOMP_OFFLOAD_usm_free (int, void *);
 extern bool GOMP_OFFLOAD_is_usm_ptr (void *);
 extern bool GOMP_OFFLOAD_page_locked_host_alloc (void **, size_t);
-extern bool GOMP_OFFLOAD_page_locked_host_free (void *);
+extern bool GOMP_OFFLOAD_page_locked_host_free (void *,
+                                               struct goacc_asyncqueue *);
+extern int GOMP_OFFLOAD_page_locked_host_register (int, void *, size_t, int);
+extern bool GOMP_OFFLOAD_page_locked_host_unregister (void *, size_t,
+                                                     struct goacc_asyncqueue *);
+extern int GOMP_OFFLOAD_page_locked_host_p (int, const void *, size_t);
 extern bool GOMP_OFFLOAD_dev2host (int, void *, const void *, size_t);
 extern bool GOMP_OFFLOAD_host2dev (int, void *, const void *, size_t);
 extern bool GOMP_OFFLOAD_dev2dev (int, void *, const void *, size_t);
index 3b2b4aa95347452b1659439a136b5101027e6d73..b7ac9d3da5b27893a29fd7d2384bc6770d96d653 100644 (file)
@@ -1123,6 +1123,8 @@ extern int gomp_pause_host (void);
 
 /* target.c */
 
+extern bool always_pinned_mode;
+
 extern void gomp_init_targets_once (void);
 extern int gomp_get_num_devices (void);
 extern bool gomp_target_task_fn (void *);
@@ -1130,6 +1132,11 @@ extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
                             int, struct goacc_asyncqueue *);
 extern void * gomp_usm_alloc (size_t size, int device_num);
 extern void gomp_usm_free (void *device_ptr, int device_num);
+extern int gomp_page_locked_host_register_dev (struct gomp_device_descr *,
+                                              void *, size_t, int);
+extern bool gomp_page_locked_host_unregister_dev (struct gomp_device_descr *,
+                                                 void *, size_t,
+                                                 struct goacc_asyncqueue *);
 extern bool gomp_page_locked_host_alloc (void **, size_t);
 extern void gomp_page_locked_host_free (void *);
 
@@ -1232,6 +1239,9 @@ struct splay_tree_key_s {
     uintptr_t *structelem_refcount_ptr;
   };
   struct splay_tree_aux *aux;
+  /* Whether we have registered page-locked host memory for
+     '[host_start, host_end)'.  */
+  bool page_locked_host_p;
 };
 
 /* The comparison function.  */
@@ -1393,6 +1403,11 @@ struct gomp_device_descr
   __typeof (GOMP_OFFLOAD_is_usm_ptr) *is_usm_ptr_func;
   __typeof (GOMP_OFFLOAD_page_locked_host_alloc) *page_locked_host_alloc_func;
   __typeof (GOMP_OFFLOAD_page_locked_host_free) *page_locked_host_free_func;
+  __typeof (GOMP_OFFLOAD_page_locked_host_register)
+       *page_locked_host_register_func;
+  __typeof (GOMP_OFFLOAD_page_locked_host_unregister)
+       *page_locked_host_unregister_func;
+  __typeof (GOMP_OFFLOAD_page_locked_host_p) *page_locked_host_p_func;
   __typeof (GOMP_OFFLOAD_dev2host) *dev2host_func;
   __typeof (GOMP_OFFLOAD_host2dev) *host2dev_func;
   __typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func;
index 6355ce2a37ba99260eb675cbdd2435b9f6242178..df52fd3039cd0bfe332f47e375c7c362ce5710b1 100644 (file)
@@ -4402,10 +4402,41 @@ creating memory allocators requesting
 The following sections present notes on the offload-target specifics
 
 @menu
+* @option{-foffload-memory=pinned}::
 * AMD Radeon::
 * nvptx::
 @end menu
 
+@node @option{-foffload-memory=pinned}
+@section @option{-foffload-memory=pinned}
+
+@c The following paragraph is duplicated from
+@c '../gcc/doc/invoke.texi', '-foffload-memory=pinned'.
+If supported by the active offloading device,
+@option{-foffload-memory=pinned} enables automatic use of page-locked
+host memory for memory objects participating in host <-> device memory
+transfers, for both OpenACC and OpenMP offloading.
+Such memory is allocated or registered using the respective offloading
+device interfaces, which potentially helps optimization of host <->
+device data transfers.
+This option is experimental.
+Beware that use of a lot of pinned memory may degrade overall system
+performance, as it does reduce the amount of host memory available for
+paging.
+
+An OpenACC @emph{async} @code{enter data}-like operation may register
+a memory object as pinned.  After the corresponding @emph{async}
+@code{exit data}-like operation, this registration does last until
+next synchronization point (such as @code{acc_async_synchronize}).
+During this time, the user code must not "touch" the host-side memory
+allocation -- but that does correspond to the @emph{async} semantics
+anyway.
+
+We don't consider @code{-foffload-memory=pinned} for one-time internal
+data transfers, such as setup during device initialization.
+
+
+
 @node AMD Radeon
 @section AMD Radeon (GCN)
 
@@ -4459,6 +4490,8 @@ The implementation remark:
 @item OpenMP @emph{pinned} memory (@code{omp_atk_pinned},
       @code{ompx_pinned_mem_alloc}, for example)
       is allocated via @code{mmap}, @code{mlock}.
+@item @option{-foffload-memory=pinned} is not supported,
+      @pxref{@option{-foffload-memory=pinned}}.
 @end itemize
 
 
@@ -4526,6 +4559,8 @@ The implementation remark:
       is allocated via @code{cuMemHostAlloc} (CUDA Driver API).
       This potentially helps optimization of host <-> device data
       transfers.
+@item @option{-foffload-memory=pinned} is supported,
+      @pxref{@option{-foffload-memory=pinned}}.
 @end itemize
 
 
index fe66a53d94a3b1855b6ed32c4522a66eb03dee2b..2a515ce734884efc39d80d0e0e82b3983a7b3460 100644 (file)
@@ -365,6 +365,7 @@ extern bool GOMP_teams4 (unsigned int, unsigned int, unsigned int, bool);
 
 extern bool GOMP_evaluate_target_device (int, const char *, const char *,
                                         const char *);
+extern void GOMP_enable_pinned_mode (void);
 
 /* teams.c */
 
@@ -375,7 +376,6 @@ extern void GOMP_teams_reg (void (*) (void *), void *, unsigned, unsigned,
 
 extern void *GOMP_alloc (size_t, size_t, uintptr_t);
 extern void GOMP_free (void *, uintptr_t);
-extern void GOMP_enable_pinned_mode (void);
 
 /* error.c */
 
index bd82beefcdb57c366ecc0f21f7ba67a9c91b55fd..75ec8958501fadfbb5c87a3293053c22dd1b1a86 100644 (file)
@@ -199,11 +199,27 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async,
     }
 
   goacc_aq aq = get_goacc_asyncqueue (async);
+
+  int h_page_locked_host_p = 0;
+
+  if (always_pinned_mode
+      && s != 0)
+    {
+      h_page_locked_host_p = gomp_page_locked_host_register_dev
+       (thr->dev, h, s, from ? GOMP_MAP_FROM : GOMP_MAP_TO);
+      if (h_page_locked_host_p < 0)
+       exit (EXIT_FAILURE);
+    }
+
   if (from)
     gomp_copy_dev2host (thr->dev, aq, h, d, s);
   else
     gomp_copy_host2dev (thr->dev, aq, d, h, s, false, /* TODO: cbuf? */ NULL);
 
+  if (h_page_locked_host_p
+      && !gomp_page_locked_host_unregister_dev (thr->dev, h, s, aq))
+    exit (EXIT_FAILURE);
+
   if (profiling_p)
     {
       thr->prof_info = NULL;
index 9b786c9f2f68698fb55cacce09676133866358da..062a141053feb873e66eebc6d2cdf4310a5ee0ae 100644 (file)
@@ -31,6 +31,10 @@ CUDA_ONE_CALL (cuMemAlloc)
 CUDA_ONE_CALL (cuMemAllocHost)
 CUDA_ONE_CALL (cuMemAllocManaged)
 CUDA_ONE_CALL (cuMemHostAlloc)
+CUDA_ONE_CALL (cuMemHostGetFlags)
+CUDA_ONE_CALL_MAYBE_NULL (cuMemHostRegister_v2)
+CUDA_ONE_CALL (cuMemHostRegister)
+CUDA_ONE_CALL (cuMemHostUnregister)
 CUDA_ONE_CALL (cuMemcpy)
 CUDA_ONE_CALL (cuMemcpyDtoDAsync)
 CUDA_ONE_CALL (cuMemcpyDtoH)
index 23f89b6fb34f3524c4759fe0b4fcf4654c5189b4..e57a2b30e6680a5d3dc25c399c32b972e5aa8a2d 100644 (file)
@@ -78,11 +78,14 @@ extern CUresult cuGetErrorString (CUresult, const char **);
 CUresult cuLinkAddData (CUlinkState, CUjitInputType, void *, size_t,
                        const char *, unsigned, CUjit_option *, void **);
 CUresult cuLinkCreate (unsigned, CUjit_option *, void **, CUlinkState *);
+#undef cuMemHostRegister
+CUresult cuMemHostRegister (void *, size_t, unsigned int);
 #else
 typedef size_t (*CUoccupancyB2DSize)(int);
 CUresult cuLinkAddData_v2 (CUlinkState, CUjitInputType, void *, size_t,
                           const char *, unsigned, CUjit_option *, void **);
 CUresult cuLinkCreate_v2 (unsigned, CUjit_option *, void **, CUlinkState *);
+CUresult cuMemHostRegister_v2 (void *, size_t, unsigned int);
 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
                                          CUoccupancyB2DSize, size_t, int);
 #endif
@@ -218,6 +221,8 @@ static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
 struct goacc_asyncqueue
 {
   CUstream cuda_stream;
+  pthread_mutex_t page_locked_host_unregister_blocks_lock;
+  struct ptx_free_block *page_locked_host_unregister_blocks;
 };
 
 struct nvptx_callback
@@ -314,6 +319,7 @@ struct ptx_device
   int warp_size;
   int max_threads_per_block;
   int max_threads_per_multiprocessor;
+  bool read_only_host_register_supported;
   int default_dims[GOMP_DIM_MAX];
   int compute_major, compute_minor;
 
@@ -340,6 +346,33 @@ struct ptx_device
 
 static struct ptx_device **ptx_devices;
 
+static struct ptx_free_block *free_host_blocks = NULL;
+static pthread_mutex_t free_host_blocks_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static bool
+nvptx_run_deferred_page_locked_host_free (void)
+{
+  GOMP_PLUGIN_debug (0, "%s\n",
+                    __FUNCTION__);
+
+  pthread_mutex_lock (&free_host_blocks_lock);
+  struct ptx_free_block *b = free_host_blocks;
+  free_host_blocks = NULL;
+  pthread_mutex_unlock (&free_host_blocks_lock);
+
+  while (b)
+    {
+      GOMP_PLUGIN_debug (0, "  b=%p: cuMemFreeHost(b->ptr=%p)\n",
+                        b, b->ptr);
+
+      struct ptx_free_block *b_next = b->next;
+      CUDA_CALL (cuMemFreeHost, b->ptr);
+      free (b);
+      b = b_next;
+    }
+  return true;
+}
+
 /* OpenMP kernels reserve a small amount of ".shared" space for use by
    omp_alloc.  The size is configured using GOMP_NVPTX_LOWLAT_POOL, but the
    default is set here.  */
@@ -542,6 +575,19 @@ nvptx_open_device (int n)
                         CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
   assert (r == CUDA_SUCCESS && pi);
 
+  /* This is a CUDA 11.1 feature.  */
+  r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi,
+                        CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED,
+                        dev);
+  if (r == CUDA_ERROR_INVALID_VALUE)
+    pi = false;
+  else if (r != CUDA_SUCCESS)
+    {
+      GOMP_PLUGIN_error ("cuDeviceGetAttribute error: %s", cuda_error (r));
+      return NULL;
+    }
+  ptx_dev->read_only_host_register_supported = pi;
+
   for (int i = 0; i != GOMP_DIM_MAX; i++)
     ptx_dev->default_dims[i] = 0;
 
@@ -1278,6 +1324,11 @@ GOMP_OFFLOAD_init_device (int n)
 bool
 GOMP_OFFLOAD_fini_device (int n)
 {
+  /* This isn't related to this specific 'ptx_devices[n]', but is a convenient
+     place to clean up.  */
+  if (!nvptx_run_deferred_page_locked_host_free ())
+    return false;
+
   pthread_mutex_lock (&ptx_dev_lock);
 
   if (ptx_devices[n] != NULL)
@@ -1711,6 +1762,12 @@ GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
   GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu\n",
                     __FUNCTION__, ptr, (unsigned long long) size);
 
+  /* TODO: Maybe running the deferred 'cuMemFreeHost's here is not the best
+     idea, given that we don't know what context we're called from?  (See
+     'GOMP_OFFLOAD_run' reverse offload handling.)  But, where to do it?  */
+  if (!nvptx_run_deferred_page_locked_host_free ())
+    return false;
+
   CUresult r;
 
   unsigned int flags = 0;
@@ -1729,16 +1786,243 @@ GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
   return true;
 }
 
+static void
+nvptx_page_locked_host_free_callback (CUstream stream, CUresult r, void *ptr)
+{
+  GOMP_PLUGIN_debug (0, "%s: stream=%p, r=%u, ptr=%p\n",
+                    __FUNCTION__, stream, (unsigned) r, ptr);
+
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_error ("%s error: %s", __FUNCTION__, cuda_error (r));
+
+  /* We can't now call 'cuMemFreeHost': we're in a CUDA stream context,
+     where we "must not make any CUDA API calls".
+     And, in particular in an OpenMP 'target' reverse offload context,
+     this may even dead-lock?!  */
+  /* See 'nvptx_free'.  */
+  struct ptx_free_block *n
+    = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
+  GOMP_PLUGIN_debug (0, "  defer; n=%p\n", n);
+  n->ptr = ptr;
+  pthread_mutex_lock (&free_host_blocks_lock);
+  n->next = free_host_blocks;
+  free_host_blocks = n;
+  pthread_mutex_unlock (&free_host_blocks_lock);
+}
+
+bool
+GOMP_OFFLOAD_page_locked_host_free (void *ptr, struct goacc_asyncqueue *aq)
+{
+  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, aq=%p\n",
+                    __FUNCTION__, ptr, aq);
+
+  if (aq)
+    {
+      GOMP_PLUGIN_debug (0, "  aq <-"
+                        " nvptx_page_locked_host_free_callback(ptr)\n");
+      CUDA_CALL (cuStreamAddCallback, aq->cuda_stream,
+                nvptx_page_locked_host_free_callback, ptr, 0);
+    }
+  else
+    CUDA_CALL (cuMemFreeHost, ptr);
+  return true;
+}
+
+static int
+nvptx_page_locked_host_p (const void *ptr, size_t size)
+{
+  GOMP_PLUGIN_debug (0, "%s: ptr=%p, size=%llu\n",
+                    __FUNCTION__, ptr, (unsigned long long) size);
+
+  int ret;
+
+  CUresult r;
+
+  /* Apparently, there exists no CUDA call to query 'PTR + [0, SIZE)'.  Instead
+     of invoking 'cuMemHostGetFlags' SIZE times, we deem it sufficient to only
+     query the base PTR.  */
+  unsigned int flags;
+  void *ptr_noconst = (void *) ptr;
+  r = CUDA_CALL_NOCHECK (cuMemHostGetFlags, &flags, ptr_noconst);
+  (void) flags;
+  if (r == CUDA_SUCCESS)
+    ret = 1;
+  else if (r == CUDA_ERROR_INVALID_VALUE)
+    ret = 0;
+  else
+    {
+      GOMP_PLUGIN_error ("cuMemHostGetFlags error: %s", cuda_error (r));
+      ret = -1;
+    }
+  GOMP_PLUGIN_debug (0, "  -> %d (with r = %u)\n",
+                    ret, (unsigned) r);
+  return ret;
+}
+
+int
+GOMP_OFFLOAD_page_locked_host_register (int ord,
+                                       void *ptr, size_t size, int kind)
+{
+  bool try_read_only;
+  /* Magic number: if the actualy mapping kind is unknown...  */
+  if (kind == -1)
+    /* ..., allow for trying read-only registration here.  */
+    try_read_only = true;
+  else
+    try_read_only = !GOMP_MAP_COPY_FROM_P (kind);
+  GOMP_PLUGIN_debug (0, "nvptx %s: ord=%d, ptr=%p, size=%llu,"
+                    " kind=%d (try_read_only=%d)\n",
+                    __FUNCTION__, ord, ptr, (unsigned long long) size,
+                    kind, try_read_only);
+  assert (size != 0);
+
+  if (!nvptx_attach_host_thread_to_device (ord))
+    return -1;
+  struct ptx_device *ptx_dev = ptx_devices[ord];
+
+  int ret = -1;
+
+  CUresult r;
+
+  unsigned int flags = 0;
+  /* Given 'CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING', we don't need
+     'flags |= CU_MEMHOSTREGISTER_PORTABLE;' here.  */
+ cuMemHostRegister:
+  if (CUDA_CALL_EXISTS (cuMemHostRegister_v2))
+    r = CUDA_CALL_NOCHECK (cuMemHostRegister_v2, ptr, size, flags);
+  else
+    r = CUDA_CALL_NOCHECK (cuMemHostRegister, ptr, size, flags);
+  if (r == CUDA_SUCCESS)
+    ret = 1;
+  else if (r == CUDA_ERROR_INVALID_VALUE)
+    {
+      /* For example, for 'cuMemHostAlloc' (via the user code, for example)
+        followed by 'cuMemHostRegister' (via 'always_pinned_mode', for
+        example), we don't get 'CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED' but
+        'CUDA_ERROR_INVALID_VALUE'.  */
+      if (nvptx_page_locked_host_p (ptr, size))
+       /* Accept the case that the region already is page-locked.  */
+       ret = 0;
+      /* Depending on certain circumstances (see 'cuMemHostRegister'
+        documentation), for example, 'const' data that is placed in section
+        '.rodata' may need 'flags |= CU_MEMHOSTREGISTER_READ_ONLY;', to avoid
+        'CUDA_ERROR_INVALID_VALUE'.  If running into that, we now apply/re-try
+        lazily instead of actively setting it above, to avoid the following
+        problem.  Supposedly/observably (but, not documented), if part of a
+        memory page has been registered without 'CU_MEMHOSTREGISTER_READ_ONLY'
+        and we then try to register another part with
+        'CU_MEMHOSTREGISTER_READ_ONLY', we'll get 'CUDA_ERROR_INVALID_VALUE'.
+        In that case, we can solve the issue by re-trying with
+        'CU_MEMHOSTREGISTER_READ_ONLY' masked out.  However, if part of a
+        memory page has been registered with 'CU_MEMHOSTREGISTER_READ_ONLY'
+        and we then try to register another part without
+        'CU_MEMHOSTREGISTER_READ_ONLY', that latter part apparently inherits
+        the former's 'CU_MEMHOSTREGISTER_READ_ONLY' (and any device to host
+        copy then fails).  We can't easily resolve that situation
+        retroactively, that is, we can't easily re-register the first
+        'CU_MEMHOSTREGISTER_READ_ONLY' part without that flag.  */
+      else if (!(flags & CU_MEMHOSTREGISTER_READ_ONLY)
+              && try_read_only
+              && ptx_dev->read_only_host_register_supported)
+       {
+         GOMP_PLUGIN_debug (0, "  flags |= CU_MEMHOSTREGISTER_READ_ONLY;\n");
+         flags |= CU_MEMHOSTREGISTER_READ_ONLY;
+         goto cuMemHostRegister;
+       }
+      /* We ought to use 'CU_MEMHOSTREGISTER_READ_ONLY', but it's not
+        available.  */
+      else if (try_read_only
+              && !ptx_dev->read_only_host_register_supported)
+       {
+         assert (!(flags & CU_MEMHOSTREGISTER_READ_ONLY));
+         GOMP_PLUGIN_debug (0, "  punt;"
+                            " CU_MEMHOSTREGISTER_READ_ONLY not available\n");
+         /* Accept this (legacy) case; we can't (easily) register page-locked
+            this region of host memory.  */
+         ret = 0;
+       }
+    }
+  else if (r == CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
+    {
+      /* 'cuMemHostRegister' (via the user code, for example) followed by
+        another (potentially partially overlapping) 'cuMemHostRegister'
+        (via 'always_pinned_mode', for example).  */
+      /* Accept this case in good faith; do not verify further.  */
+      ret = 0;
+    }
+  if (ret == -1)
+    GOMP_PLUGIN_error ("cuMemHostRegister error: %s", cuda_error (r));
+  GOMP_PLUGIN_debug (0, "  -> %d (with r = %u)\n",
+                    ret, (unsigned) r);
+  return ret;
+}
+
+/* CUDA stream callback: B_ packs an asyncqueue and a host pointer.  We may
+   not call 'cuMemHostUnregister' here (see below), so queue the pointer on
+   the asyncqueue's 'page_locked_host_unregister_blocks' list for deferred
+   processing ('nvptx_run_deferred_page_locked_host_unregister').  */
+static void
+nvptx_page_locked_host_unregister_callback (CUstream stream, CUresult r,
+                                           void *b_)
+{
+  void **b = b_;
+  struct goacc_asyncqueue *aq = b[0];
+  void *ptr = b[1];
+  GOMP_PLUGIN_debug (0, "%s: stream=%p, r=%u, b_=%p (aq=%p, ptr=%p)\n",
+                    __FUNCTION__, stream, (unsigned) r, b_, aq, ptr);
+
+  free (b_);
+
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_error ("%s error: %s", __FUNCTION__, cuda_error (r));
+
+  /* We can't now call 'cuMemHostUnregister': we're in a CUDA stream context,
+     where we "must not make any CUDA API calls".  */
+  /* See 'nvptx_free'.  */
+  struct ptx_free_block *n
+    = GOMP_PLUGIN_malloc (sizeof (struct ptx_free_block));
+  GOMP_PLUGIN_debug (0, "  defer; n=%p\n", n);
+  n->ptr = ptr;
+  pthread_mutex_lock (&aq->page_locked_host_unregister_blocks_lock);
+  n->next = aq->page_locked_host_unregister_blocks;
+  aq->page_locked_host_unregister_blocks = n;
+  pthread_mutex_unlock (&aq->page_locked_host_unregister_blocks_lock);
+}
+
+/* Plugin entry point: unregister the page-locked host memory region at PTR
+   (SIZE bytes; must be non-zero).  If AQ is non-NULL, defer the actual
+   'cuMemHostUnregister' via a stream callback; otherwise unregister
+   synchronously.  Returns 'true' on success.  */
 bool
-GOMP_OFFLOAD_page_locked_host_free (void *ptr)
+GOMP_OFFLOAD_page_locked_host_unregister (void *ptr, size_t size,
+                                         struct goacc_asyncqueue *aq)
 {
-  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p\n",
-                    __FUNCTION__, ptr);
+  GOMP_PLUGIN_debug (0, "nvptx %s: ptr=%p, size=%llu, aq=%p\n",
+                    __FUNCTION__, ptr, (unsigned long long) size, aq);
+  assert (size != 0);
 
-  CUDA_CALL (cuMemFreeHost, ptr);
+  if (aq)
+    {
+      /* We don't unregister right away, as in-flight operations may still
+        benefit from the registration.  */
+      void **b = GOMP_PLUGIN_malloc (2 * sizeof (*b));
+      b[0] = aq;
+      b[1] = ptr;
+      GOMP_PLUGIN_debug (0, "  aq <-"
+                        " nvptx_page_locked_host_unregister_callback(b=%p)\n",
+                        b);
+      CUDA_CALL (cuStreamAddCallback, aq->cuda_stream,
+                nvptx_page_locked_host_unregister_callback, b, 0);
+    }
+  else
+    CUDA_CALL (cuMemHostUnregister, ptr);
   return true;
 }
 
+/* Plugin entry point: test whether the host memory region at PTR (SIZE
+   bytes) is page-locked, for device ORD.  Returns -1 if attaching the host
+   thread to device ORD fails; otherwise the result of
+   'nvptx_page_locked_host_p'.  */
+int
+GOMP_OFFLOAD_page_locked_host_p (int ord, const void *ptr, size_t size)
+{
+  GOMP_PLUGIN_debug (0, "nvptx %s: ord=%d, ptr=%p, size=%llu\n",
+                    __FUNCTION__, ord, ptr, (unsigned long long) size);
+
+  if (!nvptx_attach_host_thread_to_device (ord))
+    return -1;
+
+  return nvptx_page_locked_host_p (ptr, size);
+}
+
 
 void
 GOMP_OFFLOAD_openacc_exec (void (*fn) (void *),
@@ -1841,12 +2125,19 @@ GOMP_OFFLOAD_openacc_cuda_set_stream (struct goacc_asyncqueue *aq, void *stream)
+/* Construct a new asyncqueue: create a CUDA stream with FLAGS and
+   initialize the (empty) list of deferred page-locked-host
+   unregistrations.  Returns NULL on stream-creation failure.  */
 static struct goacc_asyncqueue *
 nvptx_goacc_asyncqueue_construct (unsigned int flags)
 {
+  GOMP_PLUGIN_debug (0, "%s: flags=%u\n",
+                    __FUNCTION__, flags);
+
   CUstream stream = NULL;
   CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);
 
   struct goacc_asyncqueue *aq
     = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
   aq->cuda_stream = stream;
+  pthread_mutex_init (&aq->page_locked_host_unregister_blocks_lock, NULL);
+  aq->page_locked_host_unregister_blocks = NULL;
+  GOMP_PLUGIN_debug (0, "  -> aq=%p (with cuda_stream=%p)\n",
+                    aq, aq->cuda_stream);
   return aq;
 }
 
@@ -1859,9 +2150,24 @@ GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+/* Destruct AQ: destroy its CUDA stream, then verify that no deferred
+   page-locked-host unregistrations are still pending, and free AQ.
+   Returns 'false' if pending blocks were found (or stream destruction
+   failed).  NOTE(review): any pending list nodes are reported but not
+   freed or unregistered here — presumably deliberate, as this indicates
+   an earlier missed synchronization; confirm.  */
 static bool
 nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
 {
+  GOMP_PLUGIN_debug (0, "nvptx %s: aq=%p\n",
+                    __FUNCTION__, aq);
+
   CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
+
+  bool ret = true;
+  pthread_mutex_lock (&aq->page_locked_host_unregister_blocks_lock);
+  if (aq->page_locked_host_unregister_blocks != NULL)
+    {
+      GOMP_PLUGIN_error ("aq->page_locked_host_unregister_blocks not empty");
+      ret = false;
+    }
+  pthread_mutex_unlock (&aq->page_locked_host_unregister_blocks_lock);
+  pthread_mutex_destroy (&aq->page_locked_host_unregister_blocks_lock);
+
   free (aq);
-  return true;
+
+  return ret;
 }
 
 bool
@@ -1870,12 +2176,50 @@ GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
   return nvptx_goacc_asyncqueue_destruct (aq);
 }
 
+/* Process AQ's deferred 'cuMemHostUnregister' requests, as queued by
+   'nvptx_page_locked_host_unregister_callback' (which, running in CUDA
+   stream callback context, must not itself make CUDA API calls).  Frees
+   the list nodes; returns 'false' if any unregistration failed.  */
+static bool
+nvptx_run_deferred_page_locked_host_unregister (struct goacc_asyncqueue *aq)
+{
+  GOMP_PLUGIN_debug (0, "%s: aq=%p\n",
+                    __FUNCTION__, aq);
+
+  bool ret = true;
+  pthread_mutex_lock (&aq->page_locked_host_unregister_blocks_lock);
+  for (struct ptx_free_block *b = aq->page_locked_host_unregister_blocks; b;)
+    {
+      GOMP_PLUGIN_debug (0, "  b=%p: cuMemHostUnregister(b->ptr=%p)\n",
+                        b, b->ptr);
+
+      struct ptx_free_block *b_next = b->next;
+      CUresult r = CUDA_CALL_NOCHECK (cuMemHostUnregister, b->ptr);
+      if (r != CUDA_SUCCESS)
+       {
+         GOMP_PLUGIN_error ("cuMemHostUnregister error: %s", cuda_error (r));
+         ret = false;
+       }
+      free (b);
+      b = b_next;
+    }
+  aq->page_locked_host_unregister_blocks = NULL;
+  pthread_mutex_unlock (&aq->page_locked_host_unregister_blocks_lock);
+  return ret;
+}
+
 int
 GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
 {
+  GOMP_PLUGIN_debug (0, "nvptx %s: aq=%p\n",
+                    __FUNCTION__, aq);
+
   CUresult r = CUDA_CALL_NOCHECK (cuStreamQuery, aq->cuda_stream);
   if (r == CUDA_SUCCESS)
-    return 1;
+    {
+      /* As a user may expect that they don't need to 'wait' if
+        'acc_async_test' returns 'true', clean up here, too.  */
+      if (!nvptx_run_deferred_page_locked_host_unregister (aq))
+       return -1;
+
+      return 1;
+    }
   if (r == CUDA_ERROR_NOT_READY)
     return 0;
 
@@ -1886,7 +2230,17 @@ GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
+/* Synchronously wait for all operations enqueued on AQ to complete, then
+   process AQ's deferred page-locked-host unregistrations.  Returns 'false'
+   on failure.  */
 static bool
 nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
 {
+  GOMP_PLUGIN_debug (0, "%s: aq=%p\n",
+                    __FUNCTION__, aq);
+
   CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
+
+  /* This is called from a user code (non-stream) context, and upon returning,
+     we must've given up on any page-locked memory registrations, so unregister
+     any pending ones now.  */
+  if (!nvptx_run_deferred_page_locked_host_unregister (aq))
+    return false;
+
   return true;
 }
 
@@ -1896,14 +2250,70 @@ GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
   return nvptx_goacc_asyncqueue_synchronize (aq);
 }
 
+/* CUDA stream callback: B_ packs two asyncqueues AQ1 and AQ2.  Append
+   AQ1's deferred page-locked-host unregistration blocks to the tail of
+   AQ2's list, leaving AQ1's list empty.  Enqueued by
+   'GOMP_OFFLOAD_openacc_async_serialize'.  */
+static void
+nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback
+(CUstream stream, CUresult r, void *b_)
+{
+  void **b = b_;
+  struct goacc_asyncqueue *aq1 = b[0];
+  struct goacc_asyncqueue *aq2 = b[1];
+  GOMP_PLUGIN_debug (0, "%s: stream=%p, r=%u, b_=%p (aq1=%p, aq2=%p)\n",
+                    __FUNCTION__, stream, (unsigned) r, b_, aq1, aq2);
+
+  free (b_);
+
+  if (r != CUDA_SUCCESS)
+    GOMP_PLUGIN_error ("%s error: %s", __FUNCTION__, cuda_error (r));
+
+  /* NOTE(review): nested locking, AQ1's lock taken before AQ2's — a
+     concurrent 'serialize' in the opposite direction would invert the
+     order; presumably that cannot happen here, but confirm.  */
+  pthread_mutex_lock (&aq1->page_locked_host_unregister_blocks_lock);
+  if (aq1->page_locked_host_unregister_blocks)
+    {
+      pthread_mutex_lock (&aq2->page_locked_host_unregister_blocks_lock);
+      GOMP_PLUGIN_debug (0, "  page_locked_host_unregister_blocks:"
+                        " aq1 -> aq2\n");
+      if (aq2->page_locked_host_unregister_blocks == NULL)
+       aq2->page_locked_host_unregister_blocks
+         = aq1->page_locked_host_unregister_blocks;
+      else
+       {
+         struct ptx_free_block *b = aq2->page_locked_host_unregister_blocks;
+         while (b->next != NULL)
+           b = b->next;
+         b->next = aq1->page_locked_host_unregister_blocks;
+       }
+      pthread_mutex_unlock (&aq2->page_locked_host_unregister_blocks_lock);
+      aq1->page_locked_host_unregister_blocks = NULL;
+    }
+  pthread_mutex_unlock (&aq1->page_locked_host_unregister_blocks_lock);
+}
+
+/* Order AQ2 after AQ1: record an event on AQ1's stream and make AQ2's
+   stream wait for it.  When the two queues differ, additionally enqueue a
+   callback on AQ1 that moves AQ1's deferred page-locked-host
+   unregistrations over to AQ2.  Returns 'false' on failure.  */
 bool
 GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
                                      struct goacc_asyncqueue *aq2)
 {
+  GOMP_PLUGIN_debug (0, "nvptx %s: aq1=%p, aq2=%p\n",
+                    __FUNCTION__, aq1, aq2);
+
+  if (aq1 != aq2)
+    {
+      void **b = GOMP_PLUGIN_malloc (2 * sizeof (*b));
+      b[0] = aq1;
+      b[1] = aq2;
+      /* Enqueue on 'aq1': move 'page_locked_host_unregister_blocks' of 'aq1'
+        to 'aq2'.  */
+      GOMP_PLUGIN_debug (0, "  aq1 <-"
+                        " nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback"
+                        "(b=%p)\n", b);
+      CUDA_CALL (cuStreamAddCallback, aq1->cuda_stream,
+                nvptx_move_page_locked_host_unregister_blocks_aq1_aq2_callback,
+                b, 0);
+    }
+
   CUevent e;
   CUDA_CALL_ERET (false, cuEventCreate, &e, CU_EVENT_DISABLE_TIMING);
   CUDA_CALL_ERET (false, cuEventRecord, e, aq1->cuda_stream);
   CUDA_CALL_ERET (false, cuStreamWaitEvent, aq2->cuda_stream, e, 0);
+
   return true;
 }
 
@@ -2238,6 +2648,19 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
            if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
              exit (EXIT_FAILURE);
            __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
+
+           /* Clean up here; otherwise we may run into the situation that
+              a following reverse offload does
+              'GOMP_OFFLOAD_page_locked_host_alloc', and that then runs the
+              deferred 'cuMemFreeHost's -- which may dead-lock?!
+              TODO: This may need more considerations for the case that
+              different host threads do reverse offload?  We could move
+              'free_host_blocks' into 'aq' (which is separate per reverse
+              offload) instead of global, like
+              'page_locked_host_unregister_blocks', but that doesn't seem the
+              right thing for OpenACC 'async' generally?  */
+           if (!nvptx_run_deferred_page_locked_host_free ())
+             exit (EXIT_FAILURE);
          }
        usleep (1);
       }
index b88b1ebaa139830248aef34ef93ead718355005b..ed2fc09cf44ea1c1c31b67b935c4ad1ae7536ef6 100644 (file)
@@ -108,6 +108,74 @@ static int num_devices_openmp;
 /* OpenMP requires mask.  */
 static int omp_requires_mask;
 
+
+static void *gomp_page_locked_host_alloc_dev (struct gomp_device_descr *,
+                                             size_t, bool);
+static bool gomp_page_locked_host_free_dev (struct gomp_device_descr *,
+                                           void *,
+                                           struct goacc_asyncqueue *);
+static void *gomp_page_locked_host_aligned_alloc_dev (struct gomp_device_descr *,
+                                                     size_t, size_t);
+static bool gomp_page_locked_host_aligned_free_dev (struct gomp_device_descr *,
+                                                   void *,
+                                                   struct goacc_asyncqueue *);
+
+/* Use (that is, allocate or register) page-locked host memory for memory
+   objects participating in host <-> device memory transfers.
+
+   When this is enabled, there is no fallback to non-page-locked host
+   memory.  */
+
+attribute_hidden
+bool always_pinned_mode = false;
+
+/* This function is called by the compiler when -foffload-memory=pinned
+   is used: switch libgomp into 'always_pinned_mode'.
+   NOTE(review): plain (non-atomic) store — presumably invoked once, early,
+   before any host <-> device transfers; confirm.  */
+
+void
+GOMP_enable_pinned_mode ()
+{
+  always_pinned_mode = true;
+}
+
+/* Verify that page-locked host memory is used for memory objects participating
+   in host <-> device memory transfers.  */
+
+static const bool verify_always_pinned_mode = false;
+
+/* Verify, via DEVICE's plugin, that the host memory region at PTR (SIZE
+   bytes) indeed is page-locked; 'gomp_error' and return 'false' if not, or
+   if the test itself fails.  Zero-size regions, and devices without
+   page-locked host memory support, trivially pass.  */
+static bool
+gomp_verify_always_pinned_mode (struct gomp_device_descr *device,
+                               const void *ptr, size_t size)
+{
+  gomp_debug (0, "%s: device=%p (%s), ptr=%p, size=%llu\n",
+             __FUNCTION__,
+             device, device->name, ptr, (unsigned long long) size);
+
+  if (size == 0)
+    /* Skip zero-size requests; for those we've got no actual region of
+       page-locked host memory.  */
+    ;
+  /* NOTE(review): gates on 'page_locked_host_register_func' but then calls
+     'page_locked_host_p_func' — presumably a plugin provides either both or
+     neither; confirm.  */
+  else if (device->page_locked_host_register_func)
+    {
+      int page_locked_host_p
+       = device->page_locked_host_p_func (device->target_id, ptr, size);
+      if (page_locked_host_p < 0)
+       {
+         gomp_error ("Failed to test page-locked host memory"
+                     " via %s libgomp plugin",
+                     device->name);
+         return false;
+       }
+      if (!page_locked_host_p)
+       {
+         gomp_error ("Failed page-locked host memory test");
+         return false;
+       }
+    }
+  return true;
+}
+
+
 /* Similar to gomp_realloc, but release register_lock before gomp_fatal.  */
 
 static void *
@@ -402,6 +470,9 @@ gomp_copy_host2dev (struct gomp_device_descr *devicep,
                  if (__builtin_expect (aq != NULL, 0))
                    assert (ephemeral);
 
+                 /* We're just filling the CBUF; 'always_pinned_mode' isn't
+                    relevant.  */
+
                  memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0].start),
                          h, sz);
                  return;
@@ -422,18 +493,92 @@ gomp_copy_host2dev (struct gomp_device_descr *devicep,
             stack local in a function that is no longer executing).  As we've
             not been able to use CBUF, make a copy of the data into a
             temporary buffer.  */
-         h_buf = gomp_malloc (sz);
+         if (always_pinned_mode)
+           {
+             h_buf = gomp_page_locked_host_alloc_dev (devicep, sz, false);
+             if (!h_buf)
+               {
+                 gomp_mutex_unlock (&devicep->lock);
+                 exit (EXIT_FAILURE);
+               }
+           }
+         else
+           h_buf = gomp_malloc (sz);
          memcpy (h_buf, h, sz);
        }
+
+      /* No 'gomp_verify_always_pinned_mode' for 'ephemeral'; have just
+        allocated.  */
+      if (!ephemeral
+         && verify_always_pinned_mode
+         && always_pinned_mode)
+       if (!gomp_verify_always_pinned_mode (devicep, h_buf, sz))
+         {
+           gomp_mutex_unlock (&devicep->lock);
+           exit (EXIT_FAILURE);
+         }
+
       goacc_device_copy_async (devicep, devicep->openacc.async.host2dev_func,
                               "dev", d, "host", h_buf, h, sz, aq);
+
       if (ephemeral)
-       /* Free once the transfer has completed.  */
-       devicep->openacc.async.queue_callback_func (aq, free, h_buf);
+       {
+         if (always_pinned_mode)
+           {
+             if (!gomp_page_locked_host_free_dev (devicep, h_buf, aq))
+               {
+                 gomp_mutex_unlock (&devicep->lock);
+                 exit (EXIT_FAILURE);
+               }
+           }
+         else
+           /* Free once the transfer has completed.  */
+           devicep->openacc.async.queue_callback_func (aq, free, h_buf);
+       }
     }
   else
-    gomp_device_copy (devicep, devicep->host2dev_func,
-                     "dev", d, "host", h, sz);
+    {
+      if (ephemeral
+         && always_pinned_mode)
+       {
+         /* TODO: Page-locking on the spot probably doesn't make a lot of
+            sense (performance-wise).  Should we instead use a "page-locked
+            host memory bounce buffer" (per host thread, or per device,
+            or...)?  */
+         void *ptr = (void *) h;
+         int page_locked_host_p
+           = gomp_page_locked_host_register_dev (devicep,
+                                                 ptr, sz, GOMP_MAP_TO);
+         if (page_locked_host_p < 0)
+           {
+             gomp_mutex_unlock (&devicep->lock);
+             exit (EXIT_FAILURE);
+           }
+         /* Ephemeral data isn't already page-locked host memory.  */
+         assert (page_locked_host_p);
+       }
+      else if (verify_always_pinned_mode
+              && always_pinned_mode)
+       if (!gomp_verify_always_pinned_mode (devicep, h, sz))
+         {
+           gomp_mutex_unlock (&devicep->lock);
+           exit (EXIT_FAILURE);
+         }
+
+      gomp_device_copy (devicep, devicep->host2dev_func,
+                       "dev", d, "host", h, sz);
+
+      if (ephemeral
+         && always_pinned_mode)
+       {
+         void *ptr = (void *) h;
+         if (!gomp_page_locked_host_unregister_dev (devicep, ptr, sz, aq))
+           {
+             gomp_mutex_unlock (&devicep->lock);
+             exit (EXIT_FAILURE);
+           }
+       }
+    }
 }
 
 attribute_hidden void
@@ -441,6 +586,14 @@ gomp_copy_dev2host (struct gomp_device_descr *devicep,
                    struct goacc_asyncqueue *aq,
                    void *h, const void *d, size_t sz)
 {
+  if (verify_always_pinned_mode
+      && always_pinned_mode)
+    if (!gomp_verify_always_pinned_mode (devicep, h, sz))
+      {
+       gomp_mutex_unlock (&devicep->lock);
+       exit (EXIT_FAILURE);
+      }
+
   if (__builtin_expect (aq != NULL, 0))
     goacc_device_copy_async (devicep, devicep->openacc.async.dev2host_func,
                             "host", h, "dev", d, NULL, sz, aq);
@@ -1367,8 +1520,19 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
        cbuf.chunk_cnt--;
       if (cbuf.chunk_cnt > 0)
        {
-         cbuf.buf
-           = malloc (cbuf.chunks[cbuf.chunk_cnt - 1].end - cbuf.chunks[0].start);
+         size_t sz
+           = cbuf.chunks[cbuf.chunk_cnt - 1].end - cbuf.chunks[0].start;
+         if (always_pinned_mode)
+           {
+             cbuf.buf = gomp_page_locked_host_alloc_dev (devicep, sz, false);
+             if (!cbuf.buf)
+               {
+                 gomp_mutex_unlock (&devicep->lock);
+                 exit (EXIT_FAILURE);
+               }
+           }
+         else
+           cbuf.buf = malloc (sz);
          if (cbuf.buf)
            {
              cbuf.tgt = tgt;
@@ -1671,6 +1835,23 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
                k->tgt = tgt;
                k->refcount = 0;
                k->dynamic_refcount = 0;
+               k->page_locked_host_p = false;
+               if (always_pinned_mode)
+                 {
+                   void *ptr = (void *) k->host_start;
+                   size_t size = k->host_end - k->host_start;
+                   int page_locked_host_p = 0;
+                   if (size != 0)
+                     page_locked_host_p = gomp_page_locked_host_register_dev
+                       (devicep, ptr, size, kind & typemask);
+                   if (page_locked_host_p < 0)
+                     {
+                       gomp_mutex_unlock (&devicep->lock);
+                       exit (EXIT_FAILURE);
+                     }
+                   if (page_locked_host_p)
+                     k->page_locked_host_p = true;
+                 }
                if (field_tgt_clear != FIELD_TGT_EMPTY)
                  {
                    k->tgt_offset = k->host_start - field_tgt_base
@@ -1976,11 +2157,22 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep,
                                                 - cbuf.chunks[0].start),
                            cbuf.chunks[c].end - cbuf.chunks[c].start,
                            false, NULL);
-      if (aq)
-       /* Free once the transfer has completed.  */
-       devicep->openacc.async.queue_callback_func (aq, free, cbuf.buf);
+      if (always_pinned_mode)
+       {
+         if (!gomp_page_locked_host_free_dev (devicep, cbuf.buf, aq))
+           {
+             gomp_mutex_unlock (&devicep->lock);
+             exit (EXIT_FAILURE);
+           }
+       }
       else
-       free (cbuf.buf);
+       {
+         if (aq)
+           /* Free once the transfer has completed.  */
+           devicep->openacc.async.queue_callback_func (aq, free, cbuf.buf);
+         else
+           free (cbuf.buf);
+       }
       cbuf.buf = NULL;
       cbufp = NULL;
     }
@@ -2112,6 +2304,23 @@ gomp_remove_var_internal (struct gomp_device_descr *devicep, splay_tree_key k,
          /* Starting from the _FIRST key, and continue for all following
             sibling keys.  */
          gomp_remove_splay_tree_key (&devicep->mem_map, k);
+
+         if (always_pinned_mode)
+           {
+             if (k->page_locked_host_p)
+               {
+                 void *ptr = (void *) k->host_start;
+                 size_t size = k->host_end - k->host_start;
+                 if (!gomp_page_locked_host_unregister_dev (devicep,
+                                                            ptr, size, aq))
+                   {
+                     gomp_mutex_unlock (&devicep->lock);
+                     exit (EXIT_FAILURE);
+                   }
+                 k->page_locked_host_p = false;
+               }
+           }
+
          if (REFCOUNT_STRUCTELEM_LAST_P (k->refcount))
            break;
          else
@@ -2119,7 +2328,25 @@ gomp_remove_var_internal (struct gomp_device_descr *devicep, splay_tree_key k,
        }
     }
   else
-    gomp_remove_splay_tree_key (&devicep->mem_map, k);
+    {
+      gomp_remove_splay_tree_key (&devicep->mem_map, k);
+
+      if (always_pinned_mode)
+       {
+         if (k->page_locked_host_p)
+           {
+             void *ptr = (void *) k->host_start;
+             size_t size = k->host_end - k->host_start;
+             if (!gomp_page_locked_host_unregister_dev (devicep,
+                                                        ptr, size, aq))
+               {
+                 gomp_mutex_unlock (&devicep->lock);
+                 exit (EXIT_FAILURE);
+               }
+             k->page_locked_host_p = false;
+           }
+       }
+    }
 
   if (aq)
     devicep->openacc.async.queue_callback_func (aq, gomp_unref_tgt_void,
@@ -2211,6 +2438,8 @@ gomp_unmap_vars_internal (struct target_mem_desc *tgt, bool do_copyfrom,
                                      + tgt->list[i].offset),
                            tgt->list[i].length);
       /* Queue all removals together for processing below.
+        We may unregister page-locked host memory only after all device to
+        host memory transfers have completed.
         See also 'gomp_exit_data'.  */
       if (do_remove)
        remove_vars[nrmvars++] = k;
@@ -2392,8 +2621,17 @@ get_gomp_offload_icvs (int dev_num)
   if (offload_icvs != NULL)
     return &offload_icvs->icvs;
 
-  struct gomp_offload_icv_list *new
-    = (struct gomp_offload_icv_list *) gomp_malloc (sizeof (struct gomp_offload_icv_list));
+  struct gomp_offload_icv_list *new;
+  size_t size = sizeof (struct gomp_offload_icv_list);
+  if (always_pinned_mode)
+    {
+      struct gomp_device_descr *device = &devices[dev_num];
+      new = gomp_page_locked_host_alloc_dev (device, size, false);
+      if (!new)
+       exit (EXIT_FAILURE);
+    }
+  else
+    new = gomp_malloc (size);
 
   new->device_num = dev_num;
   new->icvs.device_num = dev_num;
@@ -2447,6 +2685,8 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
                           const void *host_table, const void *target_data,
                           bool is_register_lock)
 {
+  gomp_debug (0, "%s: devicep=%p (%s)\n",
+             __FUNCTION__, devicep, devicep->name);
   void **host_func_table = ((void ***) host_table)[0];
   void **host_funcs_end  = ((void ***) host_table)[1];
   void **host_var_table  = ((void ***) host_table)[2];
@@ -2511,6 +2751,7 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
       k->refcount = REFCOUNT_INFINITY;
       k->dynamic_refcount = 0;
       k->aux = NULL;
+      k->page_locked_host_p = false;
       array->left = NULL;
       array->right = NULL;
       splay_tree_insert (&devicep->mem_map, array);
@@ -2556,6 +2797,34 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
       k->refcount = is_link_var ? REFCOUNT_LINK : REFCOUNT_INFINITY;
       k->dynamic_refcount = 0;
       k->aux = NULL;
+      k->page_locked_host_p = false;
+      if (always_pinned_mode)
+       {
+         void *ptr = (void *) k->host_start;
+         size_t size = k->host_end - k->host_start;
+         gomp_debug (0, "  var %d: ptr=%p, size=%llu, is_link_var=%d\n",
+                     i, ptr, (unsigned long long) size, is_link_var);
+         if (!is_link_var)
+           {
+             /* '#pragma omp declare target' variables typically are
+                read/write, but in particular artificial ones, like Fortran
+                array constructors, may be placed in section '.rodata'.
+                We don't have the actual mapping kind available here, so we
+                use a magic number.  */
+             const int kind = -1;
+             int page_locked_host_p = gomp_page_locked_host_register_dev
+               (devicep, ptr, size, kind);
+             if (page_locked_host_p < 0)
+               {
+                 gomp_mutex_unlock (&devicep->lock);
+                 if (is_register_lock)
+                   gomp_mutex_unlock (&register_lock);
+                 exit (EXIT_FAILURE);
+               }
+             if (page_locked_host_p)
+               k->page_locked_host_p = true;
+           }
+       }
       array->left = NULL;
       array->right = NULL;
       splay_tree_insert (&devicep->mem_map, array);
@@ -2577,6 +2846,13 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
             devicep->target_id.  */
          int dev_num = (int) (devicep - &devices[0]);
          struct gomp_offload_icvs *icvs = get_gomp_offload_icvs (dev_num);
+         if (!icvs)
+           {
+             gomp_mutex_unlock (&devicep->lock);
+             if (is_register_lock)
+               gomp_mutex_unlock (&register_lock);
+             gomp_fatal ("'get_gomp_offload_icvs' failed");
+           }
          size_t var_size = var->end - var->start;
          if (var_size != sizeof (struct gomp_offload_icvs))
            {
@@ -2599,6 +2875,8 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
          k->refcount = REFCOUNT_INFINITY;
          k->dynamic_refcount = 0;
          k->aux = NULL;
+         /* 'always_pinned_mode' handled via 'get_gomp_offload_icvs'.  */
+         k->page_locked_host_p = always_pinned_mode;
          array->left = NULL;
          array->right = NULL;
          splay_tree_insert (&devicep->mem_map, array);
@@ -3261,6 +3539,12 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
 
   flags = clear_unsupported_flags (devicep, flags);
 
+  /* For 'nowait' we supposedly have to unregister/free page-locked host memory
+     via 'GOMP_PLUGIN_target_task_completion'.  There is no current
+     configuration exercising this (and thus, infeasible to test).  */
+  assert (!(flags & GOMP_TARGET_FLAG_NOWAIT)
+         || !(devicep && devicep->page_locked_host_register_func));
+
   if (flags & GOMP_TARGET_FLAG_NOWAIT)
     {
       struct gomp_thread *thr = gomp_thread ();
@@ -3572,18 +3856,37 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
     }
   else
     {
-      devaddrs = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
-      sizes = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
-      kinds = (unsigned short *) gomp_malloc (mapnum * sizeof (unsigned short));
+      size_t devaddrs_size = mapnum * sizeof (uint64_t);
+      size_t sizes_size = mapnum * sizeof (uint64_t);
+      size_t kinds_size = mapnum * sizeof (unsigned short);
+      if (always_pinned_mode)
+       {
+         if (!(devaddrs = gomp_page_locked_host_alloc_dev (devicep,
+                                                           devaddrs_size,
+                                                           false))
+             || !(sizes = gomp_page_locked_host_alloc_dev (devicep,
+                                                           sizes_size,
+                                                           false))
+             || !(kinds = gomp_page_locked_host_alloc_dev (devicep,
+                                                           kinds_size,
+                                                           false)))
+           exit (EXIT_FAILURE);
+       }
+      else
+       {
+         devaddrs = gomp_malloc (devaddrs_size);
+         sizes = gomp_malloc (sizes_size);
+         kinds = gomp_malloc (kinds_size);
+       }
       gomp_copy_dev2host (devicep, aq, devaddrs,
                          (const void *) (uintptr_t) devaddrs_ptr,
-                         mapnum * sizeof (uint64_t));
+                         devaddrs_size);
       gomp_copy_dev2host (devicep, aq, sizes,
                          (const void *) (uintptr_t) sizes_ptr,
-                         mapnum * sizeof (uint64_t));
+                         sizes_size);
       gomp_copy_dev2host (devicep, aq, kinds,
                          (const void *) (uintptr_t) kinds_ptr,
-                         mapnum * sizeof (unsigned short));
+                         kinds_size);
       if (aq && !devicep->openacc.async.synchronize_func (aq))
        exit (EXIT_FAILURE);
     }
@@ -3598,7 +3901,23 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
 
   if (tgt_align)
     {
-      char *tgt = gomp_alloca (tgt_size + tgt_align - 1);
+      size_t tgt_alloc_size = tgt_size + tgt_align - 1;
+      char *tgt = gomp_alloca (tgt_alloc_size);
+      if (always_pinned_mode)
+       {
+         /* TODO: See 'gomp_copy_host2dev' re "page-locking on the spot".
+            On the other hand, performance isn't really a concern, here.  */
+         int page_locked_host_p = 0;
+         if (tgt_alloc_size != 0)
+           {
+             page_locked_host_p = gomp_page_locked_host_register_dev
+               (devicep, tgt, tgt_alloc_size, GOMP_MAP_TOFROM);
+             if (page_locked_host_p < 0)
+               exit (EXIT_FAILURE);
+             /* 'gomp_alloca' isn't already page-locked host memory.  */
+             assert (page_locked_host_p);
+           }
+       }
       uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
       if (al)
        tgt += tgt_align - al;
@@ -3632,6 +3951,14 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
                ++i;
              }
          }
+      if (always_pinned_mode)
+       {
+         if (tgt_alloc_size != 0
+             && !gomp_page_locked_host_unregister_dev (devicep,
+                                                       tgt, tgt_alloc_size,
+                                                       NULL))
+           exit (EXIT_FAILURE);
+       }
     }
 
   if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) && mapnum > 0)
@@ -3718,9 +4045,20 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
                  {
                    cdata[i].aligned = true;
                    size_t align = (size_t) 1 << (kinds[i] >> 8);
-                   devaddrs[i]
-                     = (uint64_t) (uintptr_t) gomp_aligned_alloc (align,
-                                                                  sizes[i]);
+                   void *ptr;
+                   if (always_pinned_mode)
+                     {
+                       ptr = gomp_page_locked_host_aligned_alloc_dev
+                         (devicep, align, sizes[i]);
+                       if (!ptr)
+                         {
+                           gomp_mutex_unlock (&devicep->lock);
+                           exit (EXIT_FAILURE);
+                         }
+                     }
+                   else
+                     ptr = gomp_aligned_alloc (align, sizes[i]);
+                   devaddrs[i] = (uint64_t) (uintptr_t) ptr;
                  }
                else if (n2 != NULL)
                  devaddrs[i] = (n2->host_start + cdata[i].devaddr
@@ -3770,7 +4108,23 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
                      }
                  }
                if (!cdata[i].present)
-                 devaddrs[i] = (uintptr_t) gomp_malloc (sizeof (void*));
+                 {
+                   void *ptr;
+                   size_t size = sizeof (void *);
+                   if (always_pinned_mode)
+                     {
+                       ptr = gomp_page_locked_host_alloc_dev (devicep,
+                                                              size, false);
+                       if (!ptr)
+                         {
+                           gomp_mutex_unlock (&devicep->lock);
+                           exit (EXIT_FAILURE);
+                         }
+                     }
+                   else
+                     ptr = gomp_malloc (size);
+                   devaddrs[i] = (uintptr_t) ptr;
+                 }
                /* Assume that when present, the pointer is already correct.  */
                if (!n2)
                  *(uint64_t *) (uintptr_t) (devaddrs[i] + sizes[i])
@@ -3803,9 +4157,20 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
                  {
                    cdata[i].aligned = true;
                    size_t align = (size_t) 1 << (kinds[i] >> 8);
-                   devaddrs[i]
-                     = (uint64_t) (uintptr_t) gomp_aligned_alloc (align,
-                                                                  sizes[i]);
+                   void *ptr;
+                   if (always_pinned_mode)
+                     {
+                       ptr = gomp_page_locked_host_aligned_alloc_dev
+                         (devicep, align, sizes[i]);
+                       if (!ptr)
+                         {
+                           gomp_mutex_unlock (&devicep->lock);
+                           exit (EXIT_FAILURE);
+                         }
+                     }
+                   else
+                     ptr = gomp_aligned_alloc (align, sizes[i]);
+                   devaddrs[i] = (uint64_t) (uintptr_t) ptr;
                    gomp_copy_dev2host (devicep, aq,
                                        (void *) (uintptr_t) devaddrs[i],
                                        (void *) (uintptr_t) cdata[i].devaddr,
@@ -3881,7 +4246,20 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
                                          + sizes[i + sizes[i]]);
                    size_t align = (size_t) 1 << (kinds[i] >> 8);
                    cdata[i].aligned = true;
-                   devaddrs[i] = (uintptr_t) gomp_aligned_alloc (align, sz);
+                   void *ptr;
+                   if (always_pinned_mode)
+                     {
+                       ptr = gomp_page_locked_host_aligned_alloc_dev
+                         (devicep, align, sz);
+                       if (!ptr)
+                         {
+                           gomp_mutex_unlock (&devicep->lock);
+                           exit (EXIT_FAILURE);
+                         }
+                     }
+                   else
+                     ptr = gomp_aligned_alloc (align, sz);
+                   devaddrs[i] = (uintptr_t) ptr;
                    devaddrs[i] -= devaddrs[i+1] - cdata[i].devaddr;
                  }
                else
@@ -3945,9 +4323,29 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
              struct_cpy = sizes[i];
            }
          else if (!cdata[i].present && cdata[i].aligned)
-           gomp_aligned_free ((void *) (uintptr_t) devaddrs[i]);
+           {
+             void *ptr = (void *) (uintptr_t) devaddrs[i];
+             if (always_pinned_mode)
+               {
+                 if (!gomp_page_locked_host_aligned_free_dev (devicep,
+                                                              ptr,
+                                                              aq))
+                   exit (EXIT_FAILURE);
+               }
+             else
+               gomp_aligned_free (ptr);
+           }
          else if (!cdata[i].present)
-           free ((void *) (uintptr_t) devaddrs[i]);
+           {
+             void *ptr = (void *) (uintptr_t) devaddrs[i];
+             if (always_pinned_mode)
+               {
+                 if (!gomp_page_locked_host_free_dev (devicep, ptr, aq))
+                   exit (EXIT_FAILURE);
+               }
+             else
+               free (ptr);
+           }
        }
       if (clean_struct)
        for (uint64_t i = 0; i < mapnum; i++)
@@ -3956,12 +4354,30 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
                  == GOMP_MAP_STRUCT))
            {
              devaddrs[i] += cdata[i+1].devaddr - cdata[i].devaddr;
-             gomp_aligned_free ((void *) (uintptr_t) devaddrs[i]);
+             void *ptr = (void *) (uintptr_t) devaddrs[i];
+             if (always_pinned_mode)
+               {
+                 if (!gomp_page_locked_host_aligned_free_dev (devicep,
+                                                              ptr, aq))
+                   exit (EXIT_FAILURE);
+               }
+             else
+               gomp_aligned_free (ptr);
            }
 
-      free (devaddrs);
-      free (sizes);
-      free (kinds);
+      if (always_pinned_mode)
+       {
+         if (!gomp_page_locked_host_free_dev (devicep, devaddrs, aq)
+             || !gomp_page_locked_host_free_dev (devicep, sizes, aq)
+             || !gomp_page_locked_host_free_dev (devicep, kinds, aq))
+           exit (EXIT_FAILURE);
+       }
+      else
+       {
+         free (devaddrs);
+         free (sizes);
+         free (kinds);
+       }
     }
 }
 
@@ -4585,6 +5001,160 @@ gomp_usm_free (void *device_ptr, int device_num)
 }
 
 
+/* Allocate page-locked host memory via DEVICE.  */
+
+static void *
+gomp_page_locked_host_alloc_dev (struct gomp_device_descr *device,
+                                size_t size, bool allow_null)
+{
+  gomp_debug (0, "%s: device=%p (%s), size=%llu\n",
+             __FUNCTION__, device, device->name, (unsigned long long) size);
+
+  void *ret;
+  if (!device->page_locked_host_alloc_func (&ret, size))
+    {
+      const char *fmt
+       = "Failed to allocate page-locked host memory via %s libgomp plugin";
+      if (allow_null)
+       gomp_fatal (fmt, device->name);
+      else
+       gomp_error (fmt, device->name);
+      ret = NULL;
+    }
+  else if (ret == NULL && !allow_null)
+    gomp_error ("Out of memory allocating %lu bytes"
+               " page-locked host memory"
+               " via %s libgomp plugin",
+               (unsigned long) size, device->name);
+  else
+    gomp_debug (0, "  -> ret=[%p, %p)\n",
+               ret, ret + size);
+  return ret;
+}
+
+/* Free page-locked host memory via DEVICE.  */
+
+static bool
+gomp_page_locked_host_free_dev (struct gomp_device_descr *device,
+                               void *ptr,
+                               struct goacc_asyncqueue *aq)
+{
+  gomp_debug (0, "%s: device=%p (%s), ptr=%p, aq=%p\n",
+             __FUNCTION__, device, device->name, ptr, aq);
+
+  if (!device->page_locked_host_free_func (ptr, aq))
+    {
+      gomp_error ("Failed to free page-locked host memory"
+                 " via %s libgomp plugin",
+                 device->name);
+      return false;
+    }
+  return true;
+}
+
+/* Allocate aligned page-locked host memory via DEVICE.
+
+   That is, 'gomp_aligned_alloc' (see 'alloc.c') for page-locked host
+   memory.  */
+
+static void *
+gomp_page_locked_host_aligned_alloc_dev (struct gomp_device_descr *device,
+                                        size_t al, size_t size)
+{
+  gomp_debug (0, "%s: device=%p (%s), al=%llu, size=%llu\n",
+             __FUNCTION__, device, device->name,
+             (unsigned long long) al, (unsigned long long) size);
+
+  void *ret;
+  if (al < sizeof (void *))
+    al = sizeof (void *);
+  ret = NULL;
+  if ((al & (al - 1)) == 0 && size)
+    {
+      void *p = gomp_page_locked_host_alloc_dev (device, size + al, true);
+      if (p)
+       {
+         void *ap = (void *) (((uintptr_t) p + al) & -al);
+         ((void **) ap)[-1] = p;
+         ret = ap;
+       }
+    }
+  if (ret == NULL)
+    gomp_error ("Out of memory allocating %lu bytes", (unsigned long) size);
+  else
+    gomp_debug (0, "  -> ret=[%p, %p)\n",
+               ret, ret + size);
+  return ret;
+}
+
+/* Free aligned page-locked host memory via DEVICE.
+
+   That is, 'gomp_aligned_free' (see 'alloc.c') for page-locked host
+   memory.  */
+
+static bool
+gomp_page_locked_host_aligned_free_dev (struct gomp_device_descr *device,
+                                       void *ptr,
+                                       struct goacc_asyncqueue *aq)
+{
+  gomp_debug (0, "%s: device=%p (%s), ptr=%p, aq=%p\n",
+             __FUNCTION__, device, device->name, ptr, aq);
+
+  if (ptr)
+    {
+      ptr = ((void **) ptr)[-1];
+      gomp_debug (0, "  ptr=%p\n",
+                 ptr);
+
+      if (!gomp_page_locked_host_free_dev (device, ptr, aq))
+       return false;
+    }
+  return true;
+}
+
+/* Register page-locked host memory via DEVICE.  */
+
+attribute_hidden int
+gomp_page_locked_host_register_dev (struct gomp_device_descr *device,
+                                   void *ptr, size_t size, int kind)
+{
+  gomp_debug (0, "%s: device=%p (%s), ptr=%p, size=%llu, kind=%d\n",
+             __FUNCTION__, device, device->name,
+             ptr, (unsigned long long) size, kind);
+  assert (size != 0);
+
+  int ret = device->page_locked_host_register_func (device->target_id,
+                                                   ptr, size, kind);
+  if (ret < 0)
+    gomp_error ("Failed to register page-locked host memory"
+               " via %s libgomp plugin",
+               device->name);
+  return ret;
+}
+
+/* Unregister page-locked host memory via DEVICE.  */
+
+attribute_hidden bool
+gomp_page_locked_host_unregister_dev (struct gomp_device_descr *device,
+                                     void *ptr, size_t size,
+                                     struct goacc_asyncqueue *aq)
+{
+  gomp_debug (0, "%s: device=%p (%s), ptr=%p, size=%llu, aq=%p\n",
+             __FUNCTION__, device, device->name,
+             ptr, (unsigned long long) size, aq);
+  assert (size != 0);
+
+  if (!device->page_locked_host_unregister_func (ptr, size, aq))
+    {
+      gomp_error ("Failed to unregister page-locked host memory"
+                 " via %s libgomp plugin",
+                 device->name);
+      return false;
+    }
+  return true;
+}
+
+
 /* Device (really: libgomp plugin) to use for paged-locked memory.  We
    assume there is either none or exactly one such device for the lifetime of
    the process.  */
@@ -4681,10 +5251,7 @@ gomp_page_locked_host_alloc (void **ptr, size_t size)
        }
       gomp_mutex_unlock (&device->lock);
 
-      if (!device->page_locked_host_alloc_func (ptr, size))
-       gomp_fatal ("Failed to allocate page-locked host memory"
-                   " via %s libgomp plugin",
-                   device->name);
+      *ptr = gomp_page_locked_host_alloc_dev (device, size, true);
     }
   return device != NULL;
 }
@@ -4713,10 +5280,8 @@ gomp_page_locked_host_free (void *ptr)
     }
   gomp_mutex_unlock (&device->lock);
 
-  if (!device->page_locked_host_free_func (ptr))
-    gomp_fatal ("Failed to free page-locked host memory"
-               " via %s libgomp plugin",
-               device->name);
+  if (!gomp_page_locked_host_free_dev (device, ptr, NULL))
+    exit (EXIT_FAILURE);
 }
 
 
@@ -4792,30 +5357,84 @@ omp_target_memcpy_copy (void *dst, const void *src, size_t length,
   bool ret;
   if (src_devicep == NULL && dst_devicep == NULL)
     {
+      /* No 'gomp_verify_always_pinned_mode' here.  */
       memcpy ((char *) dst + dst_offset, (char *) src + src_offset, length);
       return 0;
     }
   if (src_devicep == NULL)
     {
       gomp_mutex_lock (&dst_devicep->lock);
+
+      void *src_ptr = (void *) src + src_offset;
+      int src_ptr_page_locked_host_p = 0;
+
+      if (always_pinned_mode)
+       {
+         if (length != 0)
+           src_ptr_page_locked_host_p = gomp_page_locked_host_register_dev
+             (dst_devicep, src_ptr, length, GOMP_MAP_TO);
+         if (src_ptr_page_locked_host_p < 0)
+           {
+             gomp_mutex_unlock (&dst_devicep->lock);
+             return ENOMEM;
+           }
+       }
+
+      /* No 'gomp_verify_always_pinned_mode' here; have just registered.  */
       ret = dst_devicep->host2dev_func (dst_devicep->target_id,
                                        (char *) dst + dst_offset,
-                                       (char *) src + src_offset, length);
+                                       src_ptr, length);
+
+      if (src_ptr_page_locked_host_p
+         && !gomp_page_locked_host_unregister_dev (dst_devicep,
+                                                   src_ptr, length, NULL))
+       {
+         gomp_mutex_unlock (&dst_devicep->lock);
+         return ENOMEM;
+       }
+
       gomp_mutex_unlock (&dst_devicep->lock);
       return (ret ? 0 : EINVAL);
     }
   if (dst_devicep == NULL)
     {
       gomp_mutex_lock (&src_devicep->lock);
+
+      void *dst_ptr = (void *) dst + dst_offset;
+      int dst_ptr_page_locked_host_p = 0;
+
+      if (always_pinned_mode)
+       {
+         if (length != 0)
+           dst_ptr_page_locked_host_p = gomp_page_locked_host_register_dev
+             (src_devicep, dst_ptr, length, GOMP_MAP_FROM);
+         if (dst_ptr_page_locked_host_p < 0)
+           {
+             gomp_mutex_unlock (&src_devicep->lock);
+             return ENOMEM;
+           }
+       }
+
+      /* No 'gomp_verify_always_pinned_mode' here; have just registered.  */
       ret = src_devicep->dev2host_func (src_devicep->target_id,
-                                       (char *) dst + dst_offset,
+                                       dst_ptr,
                                        (char *) src + src_offset, length);
+
+      if (dst_ptr_page_locked_host_p
+         && !gomp_page_locked_host_unregister_dev (src_devicep,
+                                                   dst_ptr, length, NULL))
+       {
+         gomp_mutex_unlock (&src_devicep->lock);
+         return ENOMEM;
+       }
+
       gomp_mutex_unlock (&src_devicep->lock);
       return (ret ? 0 : EINVAL);
     }
   if (src_devicep == dst_devicep)
     {
       gomp_mutex_lock (&src_devicep->lock);
+      /* No 'gomp_verify_always_pinned_mode' here.  */
       ret = src_devicep->dev2dev_func (src_devicep->target_id,
                                       (char *) dst + dst_offset,
                                       (char *) src + src_offset, length);
@@ -4927,21 +5546,63 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
        return EINVAL;
       if (dst_devicep == NULL && src_devicep == NULL)
        {
+         /* No 'gomp_verify_always_pinned_mode' here.  */
          memcpy ((char *) dst + dst_off, (const char *) src + src_off,
                  length);
          ret = 1;
        }
       else if (src_devicep == NULL)
-       ret = dst_devicep->host2dev_func (dst_devicep->target_id,
-                                         (char *) dst + dst_off,
-                                         (const char *) src + src_off,
-                                         length);
+       {
+         void *src_ptr = (void *) src + src_off;
+         int src_ptr_page_locked_host_p = 0;
+
+         if (always_pinned_mode)
+           {
+             if (length != 0)
+               src_ptr_page_locked_host_p = gomp_page_locked_host_register_dev
+                 (dst_devicep, src_ptr, length, GOMP_MAP_TO);
+             if (src_ptr_page_locked_host_p < 0)
+               return ENOMEM;
+           }
+
+         /* No 'gomp_verify_always_pinned_mode' here; have just registered.  */
+         ret = dst_devicep->host2dev_func (dst_devicep->target_id,
+                                           (char *) dst + dst_off,
+                                           src_ptr,
+                                           length);
+
+         if (src_ptr_page_locked_host_p
+             && !gomp_page_locked_host_unregister_dev (dst_devicep,
+                                                       src_ptr, length, NULL))
+           return ENOMEM;
+       }
       else if (dst_devicep == NULL)
-       ret = src_devicep->dev2host_func (src_devicep->target_id,
-                                         (char *) dst + dst_off,
-                                         (const char *) src + src_off,
-                                         length);
+       {
+         void *dst_ptr = (void *) dst + dst_off;
+         int dst_ptr_page_locked_host_p = 0;
+
+         if (always_pinned_mode)
+           {
+             if (length != 0)
+               dst_ptr_page_locked_host_p = gomp_page_locked_host_register_dev
+                 (src_devicep, dst_ptr, length, GOMP_MAP_FROM);
+             if (dst_ptr_page_locked_host_p < 0)
+               return ENOMEM;
+           }
+
+         /* No 'gomp_verify_always_pinned_mode' here; have just registered.  */
+         ret = src_devicep->dev2host_func (src_devicep->target_id,
+                                           dst_ptr,
+                                           (const char *) src + src_off,
+                                           length);
+
+         if (dst_ptr_page_locked_host_p
+             && !gomp_page_locked_host_unregister_dev (src_devicep,
+                                                       dst_ptr, length, NULL))
+           return ENOMEM;
+       }
       else if (src_devicep == dst_devicep)
+       /* No 'gomp_verify_always_pinned_mode' here.  */
        ret = src_devicep->dev2dev_func (src_devicep->target_id,
                                         (char *) dst + dst_off,
                                         (const char *) src + src_off,
@@ -5184,6 +5845,7 @@ omp_target_associate_ptr (const void *host_ptr, const void *device_ptr,
       k->refcount = REFCOUNT_INFINITY;
       k->dynamic_refcount = 0;
       k->aux = NULL;
+      k->page_locked_host_p = false;
       array->left = NULL;
       array->right = NULL;
       splay_tree_insert (&devicep->mem_map, array);
@@ -5406,6 +6068,9 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
   DLSYM_OPT (is_usm_ptr, is_usm_ptr);
   DLSYM_OPT (page_locked_host_alloc, page_locked_host_alloc);
   DLSYM_OPT (page_locked_host_free, page_locked_host_free);
+  DLSYM_OPT (page_locked_host_register, page_locked_host_register);
+  DLSYM_OPT (page_locked_host_unregister, page_locked_host_unregister);
+  DLSYM_OPT (page_locked_host_p, page_locked_host_p);
   DLSYM (dev2host);
   DLSYM (host2dev);
   DLSYM (evaluate_device);
diff --git a/libgomp/testsuite/libgomp.c/alloc-pinned-7.c b/libgomp/testsuite/libgomp.c/alloc-pinned-7.c
deleted file mode 100644 (file)
index 8dc1905..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-/* { dg-do run } */
-/* { dg-additional-options "-foffload-memory=pinned" } */
-
-/* { dg-xfail-run-if "Pinning not implemented on this host" { ! *-*-linux-gnu } } */
-
-/* Test that pinned memory works.  */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#ifdef __linux__
-#include <sys/types.h>
-#include <unistd.h>
-
-#include <sys/mman.h>
-
-int
-get_pinned_mem ()
-{
-  int pid = getpid ();
-  char buf[100];
-  sprintf (buf, "/proc/%d/status", pid);
-
-  FILE *proc = fopen (buf, "r");
-  if (!proc)
-    abort ();
-  while (fgets (buf, 100, proc))
-    {
-      int val;
-      if (sscanf (buf, "VmLck: %d", &val))
-       {
-         fclose (proc);
-         return val;
-       }
-    }
-  abort ();
-}
-#else
-int
-get_pinned_mem ()
-{
-  return 0;
-}
-
-#define mlockall(...) 0
-#endif
-
-#include <omp.h>
-
-int
-main ()
-{
-  // Sanity check
-  if (get_pinned_mem () == 0)
-    {
-      /* -foffload-memory=pinned has failed, but maybe that's because
-        isufficient pinned memory was available.  */
-      if (mlockall (MCL_CURRENT | MCL_FUTURE) == 0)
-       abort ();
-    }
-
-  return 0;
-}