From: Andrew Stubbs Date: Fri, 19 Jul 2019 11:00:53 +0000 (+0100) Subject: Move offload data into GPU memory. X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=bb76aa75296c99ffb79064694165494cc7119eed;p=thirdparty%2Fgcc.git Move offload data into GPU memory. 2019-09-09 Andrew Stubbs gcc/ * config/gcn/gcn-run.c (heap_region): New global variable. (struct hsa_runtime_fn_info): Add hsa_memory_assign_agent_fn. (init_hsa_runtime_functions): Initialize hsa_memory_assign_agent. (get_kernarg_region): Move contents to .... (get_memory_region): .... here. (get_heap_region): New function. (init_device): Initialize the heap_region. (device_malloc): Add region parameter. (struct kernargs): Move heap .... (heap): ... to global scope. (main): Allocate heap separate to kernargs. libgomp/ * plugin/plugin-gcn.c (struct hsa_runtime_fn_info): Add hsa_memory_assign_agent_fn. (struct agent_info): Add data_region. (init_hsa_runtime_functions): Initialize hsa_memory_assign_agent. (get_kernarg_memory_region): Move contents to new function ... (get_memory_region): ... here. (get_data_memory_region): New function. (GOMP_OFFLOAD_get_property): Use data_region, not kernarg_region. (GOMP_OFFLOAD_init_device): Initialize data_region. (create_and_finalize_hsa_program): Use data_region, not kernarg_region, and assign heap to device agent. (GOMP_OFFLOAD_alloc_by_agent): Likewise. (image_address_p): Delete function. (struct copy_data): Remove use_hsa_memory_copy. (copy_data): Always use hsa_memory_copy. (queue_push_copy): Remove use_hsa_memory_copy. (GOMP_OFFLOAD_dev2host): Always use hsa_memory_copy. (GOMP_OFFLOAD_host2dev): Likewise. (GOMP_OFFLOAD_dev2dev): Likewise. (gcn_exec): Use hsa_memory_copy. (GOMP_OFFLOAD_openacc_async_host2dev): Always use hsa_memory_copy. (GOMP_OFFLOAD_openacc_async_dev2host): Likewise. (cherry picked from openacc-gcc-9-branch commit cd5b623df8eccfdf81930414765e2c1f1ecaab9a) --- diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp index 80550f77448f..c44cffb10413 100644 --- a/gcc/ChangeLog.omp +++ b/gcc/ChangeLog.omp @@ -1,3 +1,17 @@ +2019-09-09 Andrew Stubbs + + * config/gcn/gcn-run.c (heap_region): New global variable. + (struct hsa_runtime_fn_info): Add hsa_memory_assign_agent_fn. + (init_hsa_runtime_functions): Initialize hsa_memory_assign_agent. + (get_kernarg_region): Move contents to .... + (get_memory_region): .... here. + (get_heap_region): New function. + (init_device): Initialize the heap_region. + (device_malloc): Add region parameter. + (struct kernargs): Move heap .... + (heap): ... to global scope. + (main): Allocate heap separate to kernargs. + 2019-09-06 Andrew Stubbs Backport from mainline: diff --git a/gcc/config/gcn/gcn-run.c b/gcc/config/gcn/gcn-run.c index 84718f428460..65c5bc30fb73 100644 --- a/gcc/config/gcn/gcn-run.c +++ b/gcc/config/gcn/gcn-run.c @@ -72,6 +72,7 @@ uint64_t main_kernel = 0; hsa_executable_t executable = { 0 }; hsa_region_t kernargs_region = { 0 }; +hsa_region_t heap_region = { 0 }; uint32_t kernarg_segment_size = 0; uint32_t group_segment_size = 0; uint32_t private_segment_size = 0; @@ -135,6 +136,8 @@ struct hsa_runtime_fn_info hsa_signal_t *signal); hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size, void **ptr); + hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent, + hsa_access_permission_t access); hsa_status_t (*hsa_memory_copy_fn) (void *dst, const void *src, size_t size); hsa_status_t (*hsa_memory_free_fn) (void *ptr); @@ -204,6 +207,7 @@ init_hsa_runtime_functions (void) DLSYM_FN (hsa_executable_freeze) DLSYM_FN (hsa_signal_create) DLSYM_FN (hsa_memory_allocate) + DLSYM_FN (hsa_memory_assign_agent) DLSYM_FN (hsa_memory_copy) DLSYM_FN (hsa_memory_free) DLSYM_FN (hsa_signal_destroy) @@ -282,7 +286,8 @@ get_gpu_agent (hsa_agent_t agent, void *data __attribute__ ((unused))) suitable one has been found. */ static hsa_status_t -get_kernarg_region (hsa_region_t region, void *data __attribute__ ((unused))) +get_memory_region (hsa_region_t region, hsa_region_t *retval, + hsa_region_global_flag_t kind) { /* Reject non-global regions. */ hsa_region_segment_t segment; @@ -294,9 +299,9 @@ get_kernarg_region (hsa_region_t region, void *data __attribute__ ((unused))) hsa_region_global_flag_t flags; hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS, &flags); - if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) + if (flags & kind) { - kernargs_region = region; + *retval = region; return HSA_STATUS_INFO_BREAK; } @@ -304,6 +309,20 @@ get_kernarg_region (hsa_region_t region, void *data __attribute__ ((unused))) return HSA_STATUS_SUCCESS; } +static hsa_status_t +get_kernarg_region (hsa_region_t region, void *data __attribute__((unused))) +{ + return get_memory_region (region, &kernargs_region, + HSA_REGION_GLOBAL_FLAG_KERNARG); +} + +static hsa_status_t +get_heap_region (hsa_region_t region, void *data __attribute__((unused))) +{ + return get_memory_region (region, &heap_region, + HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED); +} + /* Initialize the HSA Runtime library and GPU device. */ static void @@ -338,6 +357,13 @@ init_device () NULL), status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK, "Locate kernargs memory"); + + /* Select a memory region for the kernel heap. + The call-back function, get_heap_region, does the selection. */ + XHSA_CMP (hsa_fns.hsa_agent_iterate_regions_fn (device, get_heap_region, + NULL), + status == HSA_STATUS_SUCCESS || status == HSA_STATUS_INFO_BREAK, + "Locate device memory"); } @@ -593,10 +619,10 @@ found_main:; __flat_scalar GCN address space). */ static void * -device_malloc (size_t size) +device_malloc (size_t size, hsa_region_t region) { void *result; - XHSA (hsa_fns.hsa_memory_allocate_fn (kernargs_region, size, &result), + XHSA (hsa_fns.hsa_memory_allocate_fn (region, size, &result), "Allocate device memory"); return result; } @@ -634,14 +660,14 @@ struct kernargs } queue[1024]; unsigned int consumed; } output_data; - - struct heap - { - int64_t size; - char data[0]; - } heap; }; +struct heap +{ + int64_t size; + char data[0]; +} heap; + /* Print any console output from the kernel. We print all entries from "consumed" to the next entry without a "written" flag, or "next_output" is reached. The buffer is circular, but the @@ -811,13 +837,19 @@ main (int argc, char *argv[]) /* Allocate device memory for both function parameters and the argv data. */ - size_t heap_size = 10 * 1024 * 1024; /* 10MB. */ - struct kernargs *kernargs = device_malloc (sizeof (*kernargs) + heap_size); + struct kernargs *kernargs = device_malloc (sizeof (*kernargs), + kernargs_region); struct argdata { int64_t argv_data[kernel_argc]; char strings[args_size]; - } *args = device_malloc (sizeof (struct argdata)); + } *args = device_malloc (sizeof (struct argdata), kernargs_region); + + size_t heap_size = 10 * 1024 * 1024; /* 10MB. */ + struct heap *heap = device_malloc (heap_size, heap_region); + XHSA (hsa_fns.hsa_memory_assign_agent_fn (heap, device, + HSA_ACCESS_PERMISSION_RW), + "Assign heap to device agent"); /* Write the data to the target. */ kernargs->argc = kernel_argc; @@ -837,8 +869,8 @@ main (int argc, char *argv[]) memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1); offset += arg_len; } - kernargs->heap_ptr = (int64_t) &kernargs->heap; - kernargs->heap.size = heap_size; + kernargs->heap_ptr = (int64_t) heap; + hsa_fns.hsa_memory_copy_fn (&heap->size, &heap_size, sizeof (heap_size)); /* Run constructors on the GPU. */ run (init_array_kernel, kernargs); diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp index 8ed0a10a5897..92a3048dd97f 100644 --- a/libgomp/ChangeLog.omp +++ b/libgomp/ChangeLog.omp @@ -1,3 +1,28 @@ +2019-09-09 Andrew Stubbs + + * plugin/plugin-gcn.c (struct hsa_runtime_fn_info): Add + hsa_memory_assign_agent_fn. + (struct agent_info): Add data_region. + (init_hsa_runtime_functions): Initialize hsa_memory_assign_agent. + (get_kernarg_memory_region): Move contents to new function ... + (get_memory_region): ... here. + (get_data_memory_region): New function. + (GOMP_OFFLOAD_get_property): Use data_region, not kernarg_region. + (GOMP_OFFLOAD_init_device): Initialize data_region. + (create_and_finalize_hsa_program): Use data_region, not + kernarg_region, and assign heap to device agent. + (GOMP_OFFLOAD_alloc_by_agent): Likewise. + (image_address_p): Delete function. + (struct copy_data): Remove use_hsa_memory_copy. + (copy_data): Always use hsa_memory_copy. + (queue_push_copy): Remove use_hsa_memory_copy. + (GOMP_OFFLOAD_dev2host): Always use hsa_memory_copy. + (GOMP_OFFLOAD_host2dev): Likewise. + (GOMP_OFFLOAD_dev2dev): Likewise. + (gcn_exec): Use hsa_memory_copy. + (GOMP_OFFLOAD_openacc_async_host2dev): Always use hsa_memory_copy. + (GOMP_OFFLOAD_openacc_async_dev2host): Likewise. + 2019-09-06 Julian Brown * plugin/plugin-gcn.c (GOMP_OFFLOAD_alloc_by_agent, diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c index 2f273967bade..2d46a876e6c8 100644 --- a/libgomp/plugin/plugin-gcn.c +++ b/libgomp/plugin/plugin-gcn.c @@ -149,6 +149,8 @@ struct hsa_runtime_fn_info hsa_signal_t *signal); hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size, void **ptr); + hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent, + hsa_access_permission_t access); hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size); hsa_status_t (*hsa_memory_free_fn) (void *ptr); hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal); @@ -629,6 +631,9 @@ struct agent_info /* The HSA memory region from which to allocate kernel arguments. */ hsa_region_t kernarg_region; + /* The HSA memory region from which to allocate device data. */ + hsa_region_t data_region; + /* Read-write lock that protects kernels which are running or about to be run from interference with loading and unloading of images. Needs to be locked for reading while a kernel is being run, and for writing if the @@ -693,6 +698,7 @@ init_hsa_runtime_functions (void) DLSYM_FN (hsa_executable_freeze) DLSYM_FN (hsa_signal_create) DLSYM_FN (hsa_memory_allocate) + DLSYM_FN (hsa_memory_assign_agent) DLSYM_FN (hsa_memory_copy) DLSYM_FN (hsa_memory_free) DLSYM_FN (hsa_signal_destroy) @@ -1293,11 +1299,12 @@ find_executable_symbol (hsa_executable_t executable, } /* Callback of hsa_agent_iterate_regions. Determine if a memory REGION can be - used for kernarg allocations and if so write it to the memory pointed to by + used for allocations of KIND and if so write it to the memory pointed to by DATA and break the query. */ static hsa_status_t -get_kernarg_memory_region (hsa_region_t region, void *data) +get_memory_region (hsa_region_t region, hsa_region_t *retval, + hsa_region_global_flag_t kind) { hsa_status_t status; hsa_region_segment_t segment; @@ -1314,15 +1321,28 @@ get_kernarg_memory_region (hsa_region_t region, void *data) &flags); if (status != HSA_STATUS_SUCCESS) return status; - if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) + if (flags & kind) { - hsa_region_t *ret = (hsa_region_t *) data; - *ret = region; + *retval = region; return HSA_STATUS_INFO_BREAK; } return HSA_STATUS_SUCCESS; } +static hsa_status_t +get_kernarg_memory_region (hsa_region_t region, void *data) +{ + return get_memory_region (region, (hsa_region_t *)data, + HSA_REGION_GLOBAL_FLAG_KERNARG); +} + +static hsa_status_t +get_data_memory_region (hsa_region_t region, void *data) +{ + return get_memory_region (region, (hsa_region_t *)data, + HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED); +} + /* Part of the libgomp plugin interface. Return the number of HSA devices on the system. */ @@ -1338,7 +1358,7 @@ union gomp_device_property_value GOMP_OFFLOAD_get_property (int device, int prop) { struct agent_info *agent = get_agent_info (device); - hsa_region_t region = agent->kernarg_region; + hsa_region_t region = agent->data_region; union gomp_device_property_value propval = { .val = 0 }; @@ -1635,6 +1655,19 @@ GOMP_OFFLOAD_init_device (int n) HSA_DEBUG ("Selected kernel arguments memory region:\n"); dump_hsa_region (agent->kernarg_region, NULL); + agent->data_region.handle = (uint64_t) -1; + status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id, + get_data_memory_region, + &agent->data_region); + if (agent->data_region.handle == (uint64_t) -1) + { + GOMP_PLUGIN_error ("Could not find suitable memory region for device " + "data"); + return false; + } + HSA_DEBUG ("Selected device data memory region:\n"); + dump_hsa_region (agent->data_region, NULL); + HSA_DEBUG ("GCN agent %d initialized\n", n); agent->initialized = true; @@ -1964,7 +1997,7 @@ create_and_finalize_hsa_program (struct agent_info *agent) if (!module->heap) { - status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region, + status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, gcn_kernel_heap_size, (void**)&module->heap); if (status != HSA_STATUS_SUCCESS) @@ -1973,7 +2006,17 @@ create_and_finalize_hsa_program (struct agent_info *agent) goto fail; } - module->heap->size = gcn_kernel_heap_size; + status = hsa_fns.hsa_memory_assign_agent_fn + (module->heap, agent->id, HSA_ACCESS_PERMISSION_RW); + if (status != HSA_STATUS_SUCCESS) + { + hsa_error ("Could not assign GCN heap memory to device", status); + goto fail; + } + + hsa_fns.hsa_memory_copy_fn (&module->heap->size, + &gcn_kernel_heap_size, + sizeof (gcn_kernel_heap_size)); } } @@ -3016,7 +3059,7 @@ GOMP_OFFLOAD_alloc_by_agent (struct agent_info *agent, size_t size) size = 4; void *ptr; - hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region, + hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size, &ptr); if (status != HSA_STATUS_SUCCESS) { @@ -3024,6 +3067,14 @@ GOMP_OFFLOAD_alloc_by_agent (struct agent_info *agent, size_t size) return NULL; } + status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id, + HSA_ACCESS_PERMISSION_RW); + if (status != HSA_STATUS_SUCCESS) + { + hsa_error ("Could not assign data memory to device", status); + return NULL; + } + struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); bool profiling_dispatch_p = __builtin_expect (thr != NULL && thr->prof_info != NULL, false); @@ -3107,27 +3158,11 @@ GOMP_OFFLOAD_free (int device, void *ptr) return true; } -/* Returns true if PTR falls within the bounds of any loaded kernel image. */ - -static bool -image_address_p (struct agent_info *agent, const void *ptr) -{ - Elf64_Addr addr = (Elf64_Addr)ptr; - if (agent->module) - { - if (addr >= agent->module->phys_address_start - && addr <= agent->module->phys_address_end) - return true; - } - return false; -} - struct copy_data { void *dst; const void *src; size_t len; - bool use_hsa_memory_copy; bool using_src_copy; struct goacc_asyncqueue *aq; }; @@ -3139,10 +3174,7 @@ copy_data (void *data_) HSA_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n", data->aq->agent->device_id, data->aq->id, data->len, data->src, data->dst); - if (data->use_hsa_memory_copy) - hsa_fns.hsa_memory_copy_fn (data->dst, data->src, data->len); - else - memcpy (data->dst, data->src, data->len); + hsa_fns.hsa_memory_copy_fn (data->dst, data->src, data->len); if (data->using_src_copy) free ((void *) data->src); free (data); @@ -3150,7 +3182,7 @@ copy_data (void *data_) static void queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src, - size_t len, bool use_hsa_memory_copy, bool using_src_copy) + size_t len, bool using_src_copy) { if (DEBUG_QUEUES) HSA_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n", @@ -3160,7 +3192,6 @@ queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src, data->dst = dst; data->src = src; data->len = len; - data->use_hsa_memory_copy = use_hsa_memory_copy; data->using_src_copy = using_src_copy; data->aq = aq; queue_push_callback (aq, copy_data, data); @@ -3171,13 +3202,7 @@ GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n) { HSA_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device, src, dst); - - /* memcpy only works for addresses allocated with hsa_memory_allocate, - but hsa_memory_copy seems unable to read from .rodata variables. */ - if (image_address_p (get_agent_info (device), src)) - hsa_fns.hsa_memory_copy_fn (dst, src, n); - else - memcpy (dst, src, n); + hsa_fns.hsa_memory_copy_fn (dst, src, n); return true; } @@ -3186,12 +3211,7 @@ GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n) { HSA_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src, device, dst); - /* memcpy only works for addresses allocated with hsa_memory_allocate, - but hsa_memory_copy seems unable to read from .rodata variables. */ - if (image_address_p (get_agent_info (device), dst)) - hsa_fns.hsa_memory_copy_fn (dst, src, n); - else - memcpy (dst, src, n); + hsa_fns.hsa_memory_copy_fn (dst, src, n); return true; } @@ -3206,14 +3226,13 @@ GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n) { struct agent_info *agent = get_agent_info (device); maybe_init_omp_async (agent); - queue_push_copy (agent->omp_async_queue, dst, src, n, false, false); + queue_push_copy (agent->omp_async_queue, dst, src, n, false); return true; } HSA_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n, device, src, device, dst); - /* We can assume that dev2dev moves are always within allocated memory. */ - memcpy (dst, src, n); + hsa_fns.hsa_memory_copy_fn (dst, src, n); return true; } @@ -3272,7 +3291,9 @@ gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs, void **ind_da = GOMP_OFFLOAD_alloc_by_agent (kernel->agent, sizeof (void*) * mapnum); for (size_t i = 0; i < mapnum; i++) - ind_da[i] = devaddrs[i] ? devaddrs[i] : hostaddrs[i]; + hsa_fns.hsa_memory_copy_fn (&ind_da[i], + devaddrs[i] ? &devaddrs[i] : &hostaddrs[i], + sizeof (void *)); struct hsa_kernel_description *hsa_kernel_desc = NULL; for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++) @@ -3582,7 +3603,7 @@ GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src, But, that is probably correct. */ void *src_copy = GOMP_PLUGIN_malloc (n); memcpy (src_copy, src, n); - queue_push_copy (aq, dst, src_copy, n, image_address_p (agent, dst), true); + queue_push_copy (aq, dst, src, n, true); return true; } @@ -3592,7 +3613,7 @@ GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src, { struct agent_info *agent = get_agent_info (device); assert (agent == aq->agent); - queue_push_copy (aq, dst, src, n, image_address_p (agent, src), false); + queue_push_copy (aq, dst, src, n, false); return true; }