From d7aaef810db54c9039c71ab72c13022b1fa24f1d Mon Sep 17 00:00:00 2001 From: Kwok Cheung Yeung Date: Fri, 17 Jul 2020 06:06:48 -0700 Subject: [PATCH] libgomp: Fix hang when profiling OpenACC programs with CUDA 9.0 nvprof The version of nvprof in CUDA 9.0 causes a hang when used to profile an OpenACC program. This is because it calls acc_get_device_type from a callback called during device initialization, which then attempts to acquire acc_device_lock while it is already taken, resulting in deadlock. This works around the issue by returning acc_device_none from acc_get_device_type without attempting to acquire the lock when initialization has not completed yet. This is a port of commit a1c022d1b9a43f85f0c451b6422fd095a704fe96 from releases/gcc-10. Most of the patch is already in this branch, so this commit only contains changes to existing code. 2020-07-17 Thomas Schwinge Kwok Cheung Yeung libgomp/ * oacc-init.c (acc_init_1): Move setting of acc_init_state to initializing to the beginning of the function. * libgomp.texi (acc_get_device_type): Update documentation. (Implementation Status and Implementation-Defined Behavior): Likewise. * testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c: New. --- libgomp/ChangeLog.omp | 9 +++ libgomp/libgomp.texi | 11 +++ libgomp/oacc-init.c | 14 ++-- .../acc_prof-init-2.c | 80 +++++++++++++++++++ 4 files changed, 109 insertions(+), 5 deletions(-) create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c diff --git a/libgomp/ChangeLog.omp b/libgomp/ChangeLog.omp index 64980dbc38a9..db88fa64661b 100644 --- a/libgomp/ChangeLog.omp +++ b/libgomp/ChangeLog.omp @@ -1,3 +1,12 @@ +2020-07-17 Thomas Schwinge + Kwok Cheung Yeung + + * oacc-init.c (acc_init_1): Move setting of acc_init_state to initializing + to the beginning of the function. + * libgomp.texi (acc_get_device_type): Update documentation. + (Implementation Status and Implementation-Defined Behavior): Likewise. 
+ * testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c: New. + 2020-07-16 Tobias Burnus * testsuite/libgomp.oacc-fortran/firstprivate-int.f90: Use diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi index 31a16fe2a4b8..a119eda9ee1c 100644 --- a/libgomp/libgomp.texi +++ b/libgomp/libgomp.texi @@ -1970,6 +1970,12 @@ in @var{devicetype}, to use when executing a parallel or kernels region. This function returns what device type will be used when executing a parallel or kernels region. +This function returns @code{acc_device_none} if +@code{acc_get_device_type} is called from +@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end} +callbacks of the OpenACC Profiling Interface (@ref{OpenACC Profiling +Interface}), that is, if the device is currently being initialized. + @item @emph{C/C++}: @multitable @columnfractions .20 .80 @item @emph{Prototype}: @tab @code{acc_device_t acc_get_device_type(void);} @@ -3385,6 +3391,11 @@ every event that has been registered. We're not yet accounting for the fact that @cite{OpenACC events may occur during event processing}. +We just handle one case specially, as required by CUDA 9.0 +@command{nvprof}, that @code{acc_get_device_type} +(@ref{acc_get_device_type}) may be called from +@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end} +callbacks. As currently there are no inquiry functions defined, calls to @code{acc_prof_lookup} will always return @code{NULL}. 
diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c index 22105915d540..8e8795308e28 100644 --- a/libgomp/oacc-init.c +++ b/libgomp/oacc-init.c @@ -233,6 +233,11 @@ acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs) static struct gomp_device_descr * acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit) { + gomp_mutex_lock (&acc_init_state_lock); + acc_init_state = initializing; + acc_init_thread = pthread_self (); + gomp_mutex_unlock (&acc_init_state_lock); + bool check_not_nested_p; if (implicit) { @@ -295,11 +300,6 @@ acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit) struct gomp_device_descr *base_dev, *acc_dev; int ndevs; - gomp_mutex_lock (&acc_init_state_lock); - acc_init_state = initializing; - acc_init_thread = pthread_self (); - gomp_mutex_unlock (&acc_init_state_lock); - base_dev = resolve_device (d, true); ndevs = base_dev->get_num_devices_func (); @@ -327,6 +327,10 @@ acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit) &api_info); } + /* We're setting 'initialized' *after* 'goacc_profiling_dispatch', so that a + nested 'acc_get_device_type' called from a profiling callback still sees + 'initializing', so that we don't deadlock when it then again tries to lock + 'goacc_prof_lock'. See also the discussion in 'acc_get_device_type'. */ gomp_mutex_lock (&acc_init_state_lock); acc_init_state = initialized; gomp_mutex_unlock (&acc_init_state_lock); diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c new file mode 100644 index 000000000000..b4e9f188aa6e --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c @@ -0,0 +1,80 @@ +/* { dg-do run } */ +/* { dg-timeout 10 } */ + +/* Test the calling of 'acc_get_device_type' from within + 'cb_device_init_start' and 'cb_device_init_end' callbacks. 
This occurs + when the CUDA 9.0 'nvprof' tool is used, and previously deadlocked. */ + +#include <assert.h> +#include <stdbool.h> +#include <acc_prof.h> + +static acc_prof_reg reg; +static acc_prof_reg unreg; +static acc_prof_lookup_func lookup; + +void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_) +{ + reg = reg_; + unreg = unreg_; + lookup = lookup_; +} + +static bool expect_cb_device_init_start; +static bool expect_cb_device_init_end; + +static void cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + assert (expect_cb_device_init_start); + expect_cb_device_init_start = false; + + acc_device_t acc_device_type; + acc_device_type = acc_get_device_type (); + assert (acc_device_type == acc_device_none); + + expect_cb_device_init_end = true; +} + +static void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + assert (expect_cb_device_init_end); + expect_cb_device_init_end = false; + + acc_device_t acc_device_type; + acc_device_type = acc_get_device_type (); + assert (acc_device_type == acc_device_none); +} + +int main(void) +{ + acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup); + + reg (acc_ev_device_init_start, cb_device_init_start, acc_reg); + reg (acc_ev_device_init_end, cb_device_init_end, acc_reg); + + expect_cb_device_init_start = true; + expect_cb_device_init_end = false; + acc_init (acc_device_host); + assert (!expect_cb_device_init_start); + assert (!expect_cb_device_init_end); + { + acc_device_t acc_device_type; + acc_device_type = acc_get_device_type (); + assert (acc_device_type == acc_device_host); + } + acc_shutdown (acc_device_host); + + expect_cb_device_init_start = true; + expect_cb_device_init_end = false; + acc_init (acc_device_default); + assert (!expect_cb_device_init_start); + assert (!expect_cb_device_init_end); + { + acc_device_t acc_device_type; + acc_device_type = acc_get_device_type (); + assert 
(acc_device_type != acc_device_none); + } + acc_shutdown (acc_device_default); + + return 0; +} -- 2.47.2