From bb7c679902ece35d1427a88e9a79305544b009e1 Mon Sep 17 00:00:00 2001
From: Simon Marchi <simon.marchi@efficios.com>
Date: Mon, 9 Jun 2025 12:09:02 -0400
Subject: [PATCH] gdb/amd-dbgapi: disable forward progress requirement in
 amd_dbgapi_target_breakpoint::check_status

ROCgdb handles target events very slowly when running a test case like
this, where a breakpoint is preset on HipTest::vectorADD:

    for (int i=0; i < numDevices; ++i) {
      HIPCHECK(hipSetDevice(i));
      hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i],
                        static_cast<const int*>(A_d[i]), static_cast<const int*>(B_d[i]), C_d[i], N);
    }

What happens is:

 - A kernel is launched
 - The internal runtime breakpoint is hit during the second
   hipLaunchKernelGGL call, which causes
   amd_dbgapi_target_breakpoint::check_status to be called
 - Meanwhile, all waves of the kernel hit the breakpoint on vectorADD
 - amd_dbgapi_target_breakpoint::check_status calls process_event_queue,
   which pulls the thousand of breakpoint hit events from the kernel
 - As part of handling the breakpoint hit events, we write the PC of the
   waves that stopped to decrement it.  Because the forward progress
   requirement is not disabled, this causes a suspend/resume of the
   queue each time, which is time-consuming.

The stack trace where this all happens is:

    #32 0x00007ffff6b9abda in amd_dbgapi_write_register (wave_id=..., register_id=..., offset=0, value_size=8, value=0x7fffea9fdcc0) at /home/smarchi/src/amd-dbgapi/src/register.cpp:587
    #33 0x00005555588c0bed in amd_dbgapi_target::store_registers (this=0x55555c7b1d20 <the_amd_dbgapi_target>, regcache=0x507000002240, regno=470) at /home/smarchi/src/wt/amd/gdb/amd-dbgapi-target.c:2504
    #34 0x000055555a5186a1 in target_store_registers (regcache=0x507000002240, regno=470) at /home/smarchi/src/wt/amd/gdb/target.c:3973
    #35 0x0000555559fab831 in regcache::raw_write (this=0x507000002240, regnum=470, src=...) at /home/smarchi/src/wt/amd/gdb/regcache.c:890
    #36 0x0000555559fabd2b in regcache::cooked_write (this=0x507000002240, regnum=470, src=...) at /home/smarchi/src/wt/amd/gdb/regcache.c:915
    #37 0x0000555559fc3ca5 in regcache::cooked_write<unsigned long, void> (this=0x507000002240, regnum=470, val=140737323456768) at /home/smarchi/src/wt/amd/gdb/regcache.c:850
    #38 0x0000555559fab09a in regcache_cooked_write_unsigned (regcache=0x507000002240, regnum=470, val=140737323456768) at /home/smarchi/src/wt/amd/gdb/regcache.c:858
    #39 0x0000555559fb0678 in regcache_write_pc (regcache=0x507000002240, pc=0x7ffff62bd900) at /home/smarchi/src/wt/amd/gdb/regcache.c:1460
    #40 0x00005555588bb37d in process_one_event (event_id=..., event_kind=AMD_DBGAPI_EVENT_KIND_WAVE_STOP) at /home/smarchi/src/wt/amd/gdb/amd-dbgapi-target.c:1873
    #41 0x00005555588bbf7b in process_event_queue (process_id=..., until_event_kind=AMD_DBGAPI_EVENT_KIND_BREAKPOINT_RESUME) at /home/smarchi/src/wt/amd/gdb/amd-dbgapi-target.c:2006
    #42 0x00005555588b1aca in amd_dbgapi_target_breakpoint::check_status (this=0x511000140900, bs=0x50600014ed00) at /home/smarchi/src/wt/amd/gdb/amd-dbgapi-target.c:890
    #43 0x0000555558c50080 in bpstat_stop_status (aspace=0x5070000061b0, bp_addr=0x7fffed0b9ab0, thread=0x518000026c80, ws=..., stop_chain=0x50600014ed00) at /home/smarchi/src/wt/amd/gdb/breakpoint.c:6126
    #44 0x000055555984f4ff in handle_signal_stop (ecs=0x7fffeaa40ef0) at /home/smarchi/src/wt/amd/gdb/infrun.c:7169
    #45 0x000055555984b889 in handle_inferior_event (ecs=0x7fffeaa40ef0) at /home/smarchi/src/wt/amd/gdb/infrun.c:6621
    #46 0x000055555983eab6 in fetch_inferior_event () at /home/smarchi/src/wt/amd/gdb/infrun.c:4750
    #47 0x00005555597caa5f in inferior_event_handler (event_type=INF_REG_EVENT) at /home/smarchi/src/wt/amd/gdb/inf-loop.c:42
    #48 0x00005555588b838e in handle_target_event (client_data=0x0) at /home/smarchi/src/wt/amd/gdb/amd-dbgapi-target.c:1513

Fix that performance problem by disabling the forward progress
requirement in amd_dbgapi_target_breakpoint::check_status, before
calling process_event_queue, so that we can process all events
efficiently.

Since the same performance problem could theoritically happen any time
process_event_queue is called with forward progress requirement enabled,
add an assert to ensure that forward progress requirement is disabled
when process_event_queue is invoked.  This makes it necessary to add a
require_forward_progress call to amd_dbgapi_finalize_core_attach.  It
looks a bit strange, since core files don't have execution, but it
doesn't hurt.

Add a test that replicates this scenario.  The test launches a kernel
that hits a breakpoint (with an always false condition) repeatedly.
Meanwhile, the host process loads an unloads a code object, causing
check_status to be called.

Bug: SWDEV-482511
Change-Id: Ida86340d679e6bd8462712953458c07ba3fd49ec
Approved-by: Lancelot Six <lancelot.six@amd.com>
---
 gdb/amd-dbgapi-target.c                       |  6 ++
 .../code-object-load-while-breakpoint-hit.cpp | 86 +++++++++++++++++++
 .../code-object-load-while-breakpoint-hit.exp | 68 +++++++++++++++
 3 files changed, 160 insertions(+)
 create mode 100644 gdb/testsuite/gdb.rocm/code-object-load-while-breakpoint-hit.cpp
 create mode 100644 gdb/testsuite/gdb.rocm/code-object-load-while-breakpoint-hit.exp
diff --git a/gdb/amd-dbgapi-target.c b/gdb/amd-dbgapi-target.c
index fd4f9cba48a..e2a8ec83404 100644
--- a/gdb/amd-dbgapi-target.c
+++ b/gdb/amd-dbgapi-target.c
@@ -568,6 +568,8 @@ amd_dbgapi_target_breakpoint::check_status (struct bpstat *bs)
   if (action == AMD_DBGAPI_BREAKPOINT_ACTION_RESUME)
     return;
 
+  require_forward_progress (*info, false);
+
   /* If the action is AMD_DBGAPI_BREAKPOINT_ACTION_HALT, we need to wait until
      a breakpoint resume event for this breakpoint_id is seen.  */
   amd_dbgapi_event_id_t resume_event_id
@@ -1335,6 +1337,10 @@ static amd_dbgapi_event_id_t
 process_event_queue (amd_dbgapi_inferior_info &info,
 		     amd_dbgapi_event_kind_t until_event_kind)
 {
+  /* Pulling events with forward progress required may result in bad
+     performance, make sure it is not required.  */
+  gdb_assert (!info.forward_progress_required);
+
   while (true)
     {
       amd_dbgapi_event_id_t event_id;
diff --git a/gdb/testsuite/gdb.rocm/code-object-load-while-breakpoint-hit.cpp b/gdb/testsuite/gdb.rocm/code-object-load-while-breakpoint-hit.cpp
new file mode 100644
index 00000000000..d75bc76fda4
--- /dev/null
+++ b/gdb/testsuite/gdb.rocm/code-object-load-while-breakpoint-hit.cpp
@@ -0,0 +1,86 @@
+/* This testcase is part of GDB, the GNU debugger.
+
+   Copyright 2025 Free Software Foundation, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#ifdef DEVICE
+
+#include <hip/hip_runtime.h>
+
+constexpr unsigned int NUM_BREAKPOINT_HITS = 5;
+
+static __device__ void
+break_here ()
+{
+}
+
+extern "C" __global__ void
+kernel ()
+{
+  for (int n = 0; n < NUM_BREAKPOINT_HITS; ++n)
+    break_here ();
+}
+
+#else
+
+#include <hip/hip_runtime.h>
+#include <unistd.h>
+
+constexpr unsigned int NUM_ITEMS_PER_BLOCK = 256;
+constexpr unsigned int NUM_BLOCKS = 128;
+constexpr unsigned int NUM_ITEMS = NUM_ITEMS_PER_BLOCK * NUM_BLOCKS;
+constexpr unsigned int NUM_LOAD_UNLOADS = 5;
+
+#define CHECK(cmd)                                                            \
+  {                                                                           \
+    hipError_t error = cmd;                                                   \
+    if (error != hipSuccess)                                                  \
+      {                                                                       \
+	fprintf (stderr, "error: '%s'(%d) at %s:%d\n",                        \
+		 hipGetErrorString (error), error, __FILE__, __LINE__);       \
+	exit (EXIT_FAILURE);                                                  \
+      }                                                                       \
+  }
+
+int
+main (int argc, const char **argv)
+{
+  if (argc != 2)
+    {
+      fprintf (stderr, "Usage: %s <hip_module_path>\n", argv[0]);
+      return 1;
+    }
+
+  const auto module_path = argv[1];
+  hipModule_t module;
+  CHECK (hipModuleLoad (&module, module_path));
+
+  /* Launch the kernel.  */
+  hipFunction_t function;
+  CHECK (hipModuleGetFunction (&function, module, "kernel"));
+  CHECK (hipModuleLaunchKernel (function, NUM_BLOCKS, 1, 1,
+				NUM_ITEMS_PER_BLOCK, 1, 1, 0, nullptr, nullptr,
+				nullptr));
+
+  /* Load and unload the module many times.  */
+  for (int i = 0; i < NUM_LOAD_UNLOADS; ++i)
+    {
+      hipModule_t dummy_module;
+      CHECK (hipModuleLoad (&dummy_module, module_path));
+      CHECK (hipModuleUnload (dummy_module));
+    }
+}
+
+#endif
diff --git a/gdb/testsuite/gdb.rocm/code-object-load-while-breakpoint-hit.exp b/gdb/testsuite/gdb.rocm/code-object-load-while-breakpoint-hit.exp
new file mode 100644
index 00000000000..3fe6a952c1d
--- /dev/null
+++ b/gdb/testsuite/gdb.rocm/code-object-load-while-breakpoint-hit.exp
@@ -0,0 +1,68 @@
+# Copyright 2025 Free Software Foundation, Inc.
+
+# This file is part of GDB.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# This test verifies what happens when a code object list update happens at the
+# same time as some wave stop events are reported.  It was added following a
+# performance bug fix, where forward progress requirement disabled when
+# pulling events from amd-dbgapi in amd_dbgapi_target_breakpoint::check_status.
+#
+# The test launches a kernel that hits a breakpoint with an always false
+# condition a certain number of times.  Meanwhile, the host loads and unloads
+# a code object in a loop, causing check_status to be called.  The hope is that
+# check_status, when calling process_event_queue, will pull many WAVE_STOP
+# events from the kernel hitting the breakpoint.
+#
+# Without the appropriate fix (of disabling forward progress requirement in
+# check_status), GDB would hit the newly-added assert in process_event_queue,
+# which verifies that forward progress requirement is disabled.  Even without
+# this assert, the test would likely time out (depending on the actual timeout
+# value).
+
+load_lib rocm.exp
+standard_testfile .cpp
+require allow_hipcc_tests
+
+# Build the host executable.
+if { [build_executable "failed to prepare" \
+	  $testfile $srcfile {debug hip}] == -1 } {
+    return -1
+}
+
+set hipmodule_path [standard_output_file ${testfile}.co]
+
+# Build the kernel object file.
+if { [gdb_compile $srcdir/$subdir/$srcfile \
+	$hipmodule_path object \
+	{ debug hip additional_flags=--genco additional_flags=-DDEVICE } ] != "" } {
+    return -1
+}
+
+proc do_test { } {
+    with_rocm_gpu_lock {
+	clean_restart $::binfile
+	gdb_test_no_output "set args $::hipmodule_path" "set args"
+
+	if { ![runto_main] } {
+		return
+	}
+
+	gdb_test "with breakpoint pending on -- break break_here if 0"
+	gdb_continue_to_end "continue to end" "continue" 1
+    }
+}
+
+do_test
-- 
2.39.5