The version of nvprof in CUDA 9.0 causes a hang when used to profile an
OpenACC program. This is because it calls acc_get_device_type from
a callback called during device initialization, which then attempts
to acquire acc_device_lock while it is already taken, resulting in
deadlock. This works around the issue by returning acc_device_none
from acc_get_device_type without attempting to acquire the lock when
initialization has not completed yet.
This is a port of commit
a1c022d1b9a43f85f0c451b6422fd095a704fe96 from
releases/gcc-10. Most of the patch is already in this branch, so this
commit only contains changes to existing code.
2020-07-17 Thomas Schwinge <thomas@codesourcery.com>
Kwok Cheung Yeung <kcy@codesourcery.com>
libgomp/
* oacc-init.c (acc_init_1): Move setting of acc_init_state to initializing
to the beginning of the function.
* libgomp.texi (acc_get_device_type): Update documentation.
(Implementation Status and Implementation-Defined Behavior): Likewise.
* testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c: New.
+2020-07-17 Thomas Schwinge <thomas@codesourcery.com>
+ Kwok Cheung Yeung <kcy@codesourcery.com>
+
+ * oacc-init.c (acc_init_1): Move setting of acc_init_state to initializing
+ to the beginning of the function.
+ * libgomp.texi (acc_get_device_type): Update documentation.
+ (Implementation Status and Implementation-Defined Behavior): Likewise.
+ * testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c: New.
+
2020-07-16 Tobias Burnus <tobias@codesourcery.com>
* testsuite/libgomp.oacc-fortran/firstprivate-int.f90: Use
This function returns what device type will be used when executing a
parallel or kernels region.
+This function returns @code{acc_device_none} if
+@code{acc_get_device_type} is called from
+@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}
+callbacks of the OpenACC Profiling Interface (@ref{OpenACC Profiling
+Interface}), that is, if the device is currently being initialized.
+
@item @emph{C/C++}:
@multitable @columnfractions .20 .80
@item @emph{Prototype}: @tab @code{acc_device_t acc_get_device_type(void);}
We're not yet accounting for the fact that @cite{OpenACC events may
occur during event processing}.
+We just handle one case specially, as required by CUDA 9.0
+@command{nvprof}, that @code{acc_get_device_type}
+(@ref{acc_get_device_type}) may be called from
+@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}
+callbacks.
As currently there are no inquiry functions defined, calls to
@code{acc_prof_lookup} will always return @code{NULL}.
static struct gomp_device_descr *
acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit)
{
+ gomp_mutex_lock (&acc_init_state_lock);
+ acc_init_state = initializing;
+ acc_init_thread = pthread_self ();
+ gomp_mutex_unlock (&acc_init_state_lock);
+
bool check_not_nested_p;
if (implicit)
{
struct gomp_device_descr *base_dev, *acc_dev;
int ndevs;
- gomp_mutex_lock (&acc_init_state_lock);
- acc_init_state = initializing;
- acc_init_thread = pthread_self ();
- gomp_mutex_unlock (&acc_init_state_lock);
-
base_dev = resolve_device (d, true);
ndevs = base_dev->get_num_devices_func ();
&api_info);
}
+ /* We're setting 'initialized' *after* 'goacc_profiling_dispatch', so that a
+ nested 'acc_get_device_type' called from a profiling callback still sees
+ 'initializing', so that we don't deadlock when it then again tries to lock
+ 'goacc_prof_lock'. See also the discussion in 'acc_get_device_type'. */
gomp_mutex_lock (&acc_init_state_lock);
acc_init_state = initialized;
gomp_mutex_unlock (&acc_init_state_lock);
--- /dev/null
+/* { dg-do run } */
+/* { dg-timeout 10 } */
+
+/* Test the calling of 'acc_get_device_type' from within
+ 'cb_device_init_start' and 'cb_device_init_end' callbacks. This occurs
+ when the CUDA 9.0 'nvprof' tool is used, and previously deadlocked. */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <acc_prof.h>
+
+static acc_prof_reg reg; /* Callback registration function, stored by 'acc_register_library'.  */
+static acc_prof_reg unreg; /* Callback unregistration function, stored by 'acc_register_library'.  */
+static acc_prof_lookup_func lookup; /* Lookup function, stored by 'acc_register_library'.  */
+
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{ /* Save the three entry points passed in into the file-scope variables.  */
+ reg = reg_;
+ unreg = unreg_;
+ lookup = lookup_;
+}
+
+static bool expect_cb_device_init_start; /* True while a 'cb_device_init_start' invocation is expected.  */
+static bool expect_cb_device_init_end; /* True while a 'cb_device_init_end' invocation is expected.  */
+
+static void cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{ /* Registered in 'main' for 'acc_ev_device_init_start'; the three parameters are unused.  */
+ assert (expect_cb_device_init_start);
+ expect_cb_device_init_start = false; /* Consume the expectation, so that an unexpected repeat invocation asserts.  */
+
+ acc_device_t acc_device_type;
+ acc_device_type = acc_get_device_type (); /* Nested call from within the callback: the case under test.  */
+ assert (acc_device_type == acc_device_none); /* No device type is reported from within this callback.  */
+
+ expect_cb_device_init_end = true; /* The end callback is expected next.  */
+}
+
+static void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{ /* Registered in 'main' for 'acc_ev_device_init_end'; the three parameters are unused.  */
+ assert (expect_cb_device_init_end); /* Only set by 'cb_device_init_start', so the start callback must have run first.  */
+ expect_cb_device_init_end = false; /* Consume the expectation; 'main' checks that it has been cleared.  */
+
+ acc_device_t acc_device_type;
+ acc_device_type = acc_get_device_type (); /* Nested call from within the callback: the case under test.  */
+ assert (acc_device_type == acc_device_none); /* Still no device type is reported from within this callback.  */
+}
+
+int main(void)
+{ /* Run the init/query/shutdown sequence twice: for 'acc_device_host', then for 'acc_device_default'.  */
+ acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup); /* Called directly here, to fill in 'reg', 'unreg', 'lookup'.  */
+
+ reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
+ reg (acc_ev_device_init_end, cb_device_init_end, acc_reg);
+
+ expect_cb_device_init_start = true;
+ expect_cb_device_init_end = false;
+ acc_init (acc_device_host);
+ assert (!expect_cb_device_init_start); /* Cleared by 'cb_device_init_start', so that callback has run.  */
+ assert (!expect_cb_device_init_end); /* Set by the start callback, cleared by the end callback, so both have run.  */
+ {
+ acc_device_t acc_device_type;
+ acc_device_type = acc_get_device_type ();
+ assert (acc_device_type == acc_device_host); /* Outside of the callbacks, the initialized device type is reported.  */
+ }
+ acc_shutdown (acc_device_host);
+
+ expect_cb_device_init_start = true; /* Re-arm: initialization after the shutdown is expected to invoke the callbacks again.  */
+ expect_cb_device_init_end = false;
+ acc_init (acc_device_default);
+ assert (!expect_cb_device_init_start);
+ assert (!expect_cb_device_init_end);
+ {
+ acc_device_t acc_device_type;
+ acc_device_type = acc_get_device_type ();
+ assert (acc_device_type != acc_device_none); /* Which device type the default resolves to is not fixed here, only that one is reported.  */
+ }
+ acc_shutdown (acc_device_default);
+
+ return 0;
+}