From bb7c679902ece35d1427a88e9a79305544b009e1 Mon Sep 17 00:00:00 2001 From: Simon Marchi Date: Mon, 9 Jun 2025 12:09:02 -0400 Subject: [PATCH] gdb/amd-dbgapi: disable forward progress requirement in amd_dbgapi_target_breakpoint::check_status ROCgdb handles target events very slowly when running a test case like this, where a breakpoint is preset on HipTest::vectorADD: for (int i=0; i < numDevices; ++i) { HIPCHECK(hipSetDevice(i)); hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i], static_cast(A_d[i]), static_cast(B_d[i]), C_d[i], N); } What happens is: - A kernel is launched - The internal runtime breakpoint is hit during the second hipLaunchKernelGGL call, which causes amd_dbgapi_target_breakpoint::check_status to be called - Meanwhile, all waves of the kernel hit the breakpoint on vectorADD - amd_dbgapi_target_breakpoint::check_status calls process_event_queue, which pulls the thousand of breakpoint hit events from the kernel - As part of handling the breakpoint hit events, we write the PC of the waves that stopped to decrement it. Because the forward progress requirement is not disabled, this causes a suspend/resume of the queue each time, which is time-consuming. The stack trace where this all happens is: #32 0x00007ffff6b9abda in amd_dbgapi_write_register (wave_id=..., register_id=..., offset=0, value_size=8, value=0x7fffea9fdcc0) at /home/smarchi/src/amd-dbgapi/src/register.cpp:587 #33 0x00005555588c0bed in amd_dbgapi_target::store_registers (this=0x55555c7b1d20 , regcache=0x507000002240, regno=470) at /home/smarchi/src/wt/amd/gdb/amd-dbgapi-target.c:2504 #34 0x000055555a5186a1 in target_store_registers (regcache=0x507000002240, regno=470) at /home/smarchi/src/wt/amd/gdb/target.c:3973 #35 0x0000555559fab831 in regcache::raw_write (this=0x507000002240, regnum=470, src=...) at /home/smarchi/src/wt/amd/gdb/regcache.c:890 #36 0x0000555559fabd2b in regcache::cooked_write (this=0x507000002240, regnum=470, src=...) at /home/smarchi/src/wt/amd/gdb/regcache.c:915 #37 0x0000555559fc3ca5 in regcache::cooked_write (this=0x507000002240, regnum=470, val=140737323456768) at /home/smarchi/src/wt/amd/gdb/regcache.c:850 #38 0x0000555559fab09a in regcache_cooked_write_unsigned (regcache=0x507000002240, regnum=470, val=140737323456768) at /home/smarchi/src/wt/amd/gdb/regcache.c:858 #39 0x0000555559fb0678 in regcache_write_pc (regcache=0x507000002240, pc=0x7ffff62bd900) at /home/smarchi/src/wt/amd/gdb/regcache.c:1460 #40 0x00005555588bb37d in process_one_event (event_id=..., event_kind=AMD_DBGAPI_EVENT_KIND_WAVE_STOP) at /home/smarchi/src/wt/amd/gdb/amd-dbgapi-target.c:1873 #41 0x00005555588bbf7b in process_event_queue (process_id=..., until_event_kind=AMD_DBGAPI_EVENT_KIND_BREAKPOINT_RESUME) at /home/smarchi/src/wt/amd/gdb/amd-dbgapi-target.c:2006 #42 0x00005555588b1aca in amd_dbgapi_target_breakpoint::check_status (this=0x511000140900, bs=0x50600014ed00) at /home/smarchi/src/wt/amd/gdb/amd-dbgapi-target.c:890 #43 0x0000555558c50080 in bpstat_stop_status (aspace=0x5070000061b0, bp_addr=0x7fffed0b9ab0, thread=0x518000026c80, ws=..., stop_chain=0x50600014ed00) at /home/smarchi/src/wt/amd/gdb/breakpoint.c:6126 #44 0x000055555984f4ff in handle_signal_stop (ecs=0x7fffeaa40ef0) at /home/smarchi/src/wt/amd/gdb/infrun.c:7169 #45 0x000055555984b889 in handle_inferior_event (ecs=0x7fffeaa40ef0) at /home/smarchi/src/wt/amd/gdb/infrun.c:6621 #46 0x000055555983eab6 in fetch_inferior_event () at /home/smarchi/src/wt/amd/gdb/infrun.c:4750 #47 0x00005555597caa5f in inferior_event_handler (event_type=INF_REG_EVENT) at /home/smarchi/src/wt/amd/gdb/inf-loop.c:42 #48 0x00005555588b838e in handle_target_event (client_data=0x0) at /home/smarchi/src/wt/amd/gdb/amd-dbgapi-target.c:1513 Fix that performance problem by disabling the forward progress requirement in amd_dbgapi_target_breakpoint::check_status, before calling process_event_queue, so that we can process all events efficiently. Since the same performance problem could theoritically happen any time process_event_queue is called with forward progress requirement enabled, add an assert to ensure that forward progress requirement is disabled when process_event_queue is invoked. This makes it necessary to add a require_forward_progress call to amd_dbgapi_finalize_core_attach. It looks a bit strange, since core files don't have execution, but it doesn't hurt. Add a test that replicates this scenario. The test launches a kernel that hits a breakpoint (with an always false condition) repeatedly. Meanwhile, the host process loads an unloads a code object, causing check_status to be called. Bug: SWDEV-482511 Change-Id: Ida86340d679e6bd8462712953458c07ba3fd49ec Approved-by: Lancelot Six --- gdb/amd-dbgapi-target.c | 6 ++ .../code-object-load-while-breakpoint-hit.cpp | 86 +++++++++++++++++++ .../code-object-load-while-breakpoint-hit.exp | 68 +++++++++++++++ 3 files changed, 160 insertions(+) create mode 100644 gdb/testsuite/gdb.rocm/code-object-load-while-breakpoint-hit.cpp create mode 100644 gdb/testsuite/gdb.rocm/code-object-load-while-breakpoint-hit.exp diff --git a/gdb/amd-dbgapi-target.c b/gdb/amd-dbgapi-target.c index fd4f9cba48a..e2a8ec83404 100644 --- a/gdb/amd-dbgapi-target.c +++ b/gdb/amd-dbgapi-target.c @@ -568,6 +568,8 @@ amd_dbgapi_target_breakpoint::check_status (struct bpstat *bs) if (action == AMD_DBGAPI_BREAKPOINT_ACTION_RESUME) return; + require_forward_progress (*info, false); + /* If the action is AMD_DBGAPI_BREAKPOINT_ACTION_HALT, we need to wait until a breakpoint resume event for this breakpoint_id is seen. */ amd_dbgapi_event_id_t resume_event_id @@ -1335,6 +1337,10 @@ static amd_dbgapi_event_id_t process_event_queue (amd_dbgapi_inferior_info &info, amd_dbgapi_event_kind_t until_event_kind) { + /* Pulling events with forward progress required may result in bad + performance, make sure it is not required. */ + gdb_assert (!info.forward_progress_required); + while (true) { amd_dbgapi_event_id_t event_id; diff --git a/gdb/testsuite/gdb.rocm/code-object-load-while-breakpoint-hit.cpp b/gdb/testsuite/gdb.rocm/code-object-load-while-breakpoint-hit.cpp new file mode 100644 index 00000000000..d75bc76fda4 --- /dev/null +++ b/gdb/testsuite/gdb.rocm/code-object-load-while-breakpoint-hit.cpp @@ -0,0 +1,86 @@ +/* This testcase is part of GDB, the GNU debugger. + + Copyright 2025 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifdef DEVICE + +#include + +constexpr unsigned int NUM_BREAKPOINT_HITS = 5; + +static __device__ void +break_here () +{ +} + +extern "C" __global__ void +kernel () +{ + for (int n = 0; n < NUM_BREAKPOINT_HITS; ++n) + break_here (); +} + +#else + +#include +#include + +constexpr unsigned int NUM_ITEMS_PER_BLOCK = 256; +constexpr unsigned int NUM_BLOCKS = 128; +constexpr unsigned int NUM_ITEMS = NUM_ITEMS_PER_BLOCK * NUM_BLOCKS; +constexpr unsigned int NUM_LOAD_UNLOADS = 5; + +#define CHECK(cmd) \ + { \ + hipError_t error = cmd; \ + if (error != hipSuccess) \ + { \ + fprintf (stderr, "error: '%s'(%d) at %s:%d\n", \ + hipGetErrorString (error), error, __FILE__, __LINE__); \ + exit (EXIT_FAILURE); \ + } \ + } + +int +main (int argc, const char **argv) +{ + if (argc != 2) + { + fprintf (stderr, "Usage: %s \n", argv[0]); + return 1; + } + + const auto module_path = argv[1]; + hipModule_t module; + CHECK (hipModuleLoad (&module, module_path)); + + /* Launch the kernel. */ + hipFunction_t function; + CHECK (hipModuleGetFunction (&function, module, "kernel")); + CHECK (hipModuleLaunchKernel (function, NUM_BLOCKS, 1, 1, + NUM_ITEMS_PER_BLOCK, 1, 1, 0, nullptr, nullptr, + nullptr)); + + /* Load and unload the module many times. */ + for (int i = 0; i < NUM_LOAD_UNLOADS; ++i) + { + hipModule_t dummy_module; + CHECK (hipModuleLoad (&dummy_module, module_path)); + CHECK (hipModuleUnload (dummy_module)); + } +} + +#endif diff --git a/gdb/testsuite/gdb.rocm/code-object-load-while-breakpoint-hit.exp b/gdb/testsuite/gdb.rocm/code-object-load-while-breakpoint-hit.exp new file mode 100644 index 00000000000..3fe6a952c1d --- /dev/null +++ b/gdb/testsuite/gdb.rocm/code-object-load-while-breakpoint-hit.exp @@ -0,0 +1,68 @@ +# Copyright 2025 Free Software Foundation, Inc. + +# This file is part of GDB. + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# This test verifies what happens when a code object list update happens at the +# same time as some wave stop events are reported. It was added following a +# performance bug fix, where forward progress requirement disabled when +# pulling events from amd-dbgapi in amd_dbgapi_target_breakpoint::check_status. +# +# The test launches a kernel that hits a breakpoint with an always false +# condition a certain number of times. Meanwhile, the host loads and unloads +# a code object in a loop, causing check_status to be called. The hope is that +# check_status, when calling process_event_queue, will pull many WAVE_STOP +# events from the kernel hitting the breakpoint. +# +# Without the appropriate fix (of disabling forward progress requirement in +# check_status), GDB would hit the newly-added assert in process_event_queue, +# which verifies that forward progress requirement is disabled. Even without +# this assert, the test would likely time out (depending on the actual timeout +# value). + +load_lib rocm.exp +standard_testfile .cpp +require allow_hipcc_tests + +# Build the host executable. +if { [build_executable "failed to prepare" \ + $testfile $srcfile {debug hip}] == -1 } { + return -1 +} + +set hipmodule_path [standard_output_file ${testfile}.co] + +# Build the kernel object file. +if { [gdb_compile $srcdir/$subdir/$srcfile \ + $hipmodule_path object \ + { debug hip additional_flags=--genco additional_flags=-DDEVICE } ] != "" } { + return -1 +} + +proc do_test { } { + with_rocm_gpu_lock { + clean_restart $::binfile + gdb_test_no_output "set args $::hipmodule_path" "set args" + + if { ![runto_main] } { + return + } + + gdb_test "with breakpoint pending on -- break break_here if 0" + gdb_continue_to_end "continue to end" "continue" 1 + } +} + +do_test -- 2.39.5