gh-138709: Implement CPU time profiling in profiling.sample (#138710)

author Pablo Galindo Salgado <Pablogsal@gmail.com>

Fri, 19 Sep 2025 18:17:28 +0000 (19:17 +0100)

committer GitHub <noreply@github.com>

Fri, 19 Sep 2025 18:17:28 +0000 (19:17 +0100)
author Pablo Galindo Salgado <Pablogsal@gmail.com>
Fri, 19 Sep 2025 18:17:28 +0000 (19:17 +0100)
committer GitHub <noreply@github.com>
Fri, 19 Sep 2025 18:17:28 +0000 (19:17 +0100)
diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h

index f393537141c076208c34f5a7d262ba251128e1fd..a598af4f37c123721279ee93473cc9b9590e5fdb 100644 (file)
--- a/Include/internal/pycore_global_objects_fini_generated.h
+++ b/Include/internal/pycore_global_objects_fini_generated.h
@@ -1267,6 +1267,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
      _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(size));
      _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sizehint));
      _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(skip_file_prefixes));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(skip_non_matching_threads));
      _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sleep));
      _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sock));
      _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sort));
diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h

index f4fde6142b9e826346fdaa9dd355da3a3cb9fff1..6959343947c1f4794bf0566dde943a34e3c0dc4e 100644 (file)
--- a/Include/internal/pycore_global_strings.h
+++ b/Include/internal/pycore_global_strings.h
@@ -758,6 +758,7 @@ struct _Py_global_strings {
          STRUCT_FOR_ID(size)
          STRUCT_FOR_ID(sizehint)
          STRUCT_FOR_ID(skip_file_prefixes)
+        STRUCT_FOR_ID(skip_non_matching_threads)
          STRUCT_FOR_ID(sleep)
          STRUCT_FOR_ID(sock)
          STRUCT_FOR_ID(sort)
diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h

index 5c0ec7dd5471150d2fbfe8f45d2c0b3ecdcd7cd8..314837c5b3f288f543f24b2165a158f28994edfb 100644 (file)
--- a/Include/internal/pycore_runtime_init_generated.h
+++ b/Include/internal/pycore_runtime_init_generated.h
@@ -1265,6 +1265,7 @@ extern "C" {
      INIT_ID(size), \
      INIT_ID(sizehint), \
      INIT_ID(skip_file_prefixes), \
+    INIT_ID(skip_non_matching_threads), \
      INIT_ID(sleep), \
      INIT_ID(sock), \
      INIT_ID(sort), \
diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h

index 1a7f1c13c6dd1659c63e8e84c4f9112f7c164a9a..45b00a20a07ddaf3c80f55f2b02550f8458be13d 100644 (file)
--- a/Include/internal/pycore_unicodeobject_generated.h
+++ b/Include/internal/pycore_unicodeobject_generated.h
@@ -2820,6 +2820,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
      _PyUnicode_InternStatic(interp, &string);
      assert(_PyUnicode_CheckConsistency(string, 1));
      assert(PyUnicode_GET_LENGTH(string) != 1);
+    string = &_Py_ID(skip_non_matching_threads);
+    _PyUnicode_InternStatic(interp, &string);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    assert(PyUnicode_GET_LENGTH(string) != 1);
      string = &_Py_ID(sleep);
      _PyUnicode_InternStatic(interp, &string);
      assert(_PyUnicode_CheckConsistency(string, 1));
diff --git a/Lib/profiling/sampling/collector.py b/Lib/profiling/sampling/collector.py

index 112d3071a1148c2c3c5875fb8fe416818388aef6..3333e7bc99d177f8433ba573a716c8f90d1e8be9 100644 (file)
--- a/Lib/profiling/sampling/collector.py
+++ b/Lib/profiling/sampling/collector.py
@@ -1,5 +1,17 @@
  from abc import ABC, abstractmethod
  
+# Enums are slow
+THREAD_STATE_RUNNING = 0
+THREAD_STATE_IDLE = 1
+THREAD_STATE_GIL_WAIT = 2
+THREAD_STATE_UNKNOWN = 3
+
+STATUS = {
+    THREAD_STATE_RUNNING: "running",
+    THREAD_STATE_IDLE: "idle",
+    THREAD_STATE_GIL_WAIT: "gil_wait",
+    THREAD_STATE_UNKNOWN: "unknown",
+}
  
  class Collector(ABC):
      @abstractmethod
@@ -10,10 +22,12 @@ class Collector(ABC):
      def export(self, filename):
          """Export collected data to a file."""
  
-    def _iter_all_frames(self, stack_frames):
+    def _iter_all_frames(self, stack_frames, skip_idle=False):
          """Iterate over all frame stacks from all interpreters and threads."""
          for interpreter_info in stack_frames:
              for thread_info in interpreter_info.threads:
+                if skip_idle and thread_info.status != THREAD_STATE_RUNNING:
+                    continue
                  frames = thread_info.frame_info
                  if frames:
                      yield frames
diff --git a/Lib/profiling/sampling/pstats_collector.py b/Lib/profiling/sampling/pstats_collector.py

index d492c15bb2aaf8b0e5636bf4155eade4da6d5a79..dec81b60659c538dcfa91a6ae0df9ac482a694f3 100644 (file)
--- a/Lib/profiling/sampling/pstats_collector.py
+++ b/Lib/profiling/sampling/pstats_collector.py
@@ -5,7 +5,7 @@ from .collector import Collector
  
  
  class PstatsCollector(Collector):
-    def __init__(self, sample_interval_usec):
+    def __init__(self, sample_interval_usec, *, skip_idle=False):
          self.result = collections.defaultdict(
              lambda: dict(total_rec_calls=0, direct_calls=0, cumulative_calls=0)
          )
@@ -14,6 +14,7 @@ class PstatsCollector(Collector):
          self.callers = collections.defaultdict(
              lambda: collections.defaultdict(int)
          )
+        self.skip_idle = skip_idle
  
      def _process_frames(self, frames):
          """Process a single thread's frame stack."""
@@ -40,7 +41,7 @@ class PstatsCollector(Collector):
              self.callers[callee][caller] += 1
  
      def collect(self, stack_frames):
-        for frames in self._iter_all_frames(stack_frames):
+        for frames in self._iter_all_frames(stack_frames, skip_idle=self.skip_idle):
              self._process_frames(frames)
  
      def export(self, filename):
diff --git a/Lib/profiling/sampling/sample.py b/Lib/profiling/sampling/sample.py

index 8a65f312234730d3aaa9ded4214ae360c824224a..20437481a0af9865f0212acbc54687e75870ffd1 100644 (file)
--- a/Lib/profiling/sampling/sample.py
+++ b/Lib/profiling/sampling/sample.py
@@ -15,6 +15,21 @@ from .pstats_collector import PstatsCollector
  from .stack_collector import CollapsedStackCollector, FlamegraphCollector
  
  _FREE_THREADED_BUILD = sysconfig.get_config_var("Py_GIL_DISABLED") is not None
+
+# Profiling mode constants
+PROFILING_MODE_WALL = 0
+PROFILING_MODE_CPU = 1
+PROFILING_MODE_GIL = 2
+
+
+def _parse_mode(mode_string):
+    """Convert mode string to mode constant."""
+    mode_map = {
+        "wall": PROFILING_MODE_WALL,
+        "cpu": PROFILING_MODE_CPU,
+        "gil": PROFILING_MODE_GIL,
+    }
+    return mode_map[mode_string]
  _HELP_DESCRIPTION = """Sample a process's stack frames and generate profiling data.
  Supports the following target modes:
    - -p PID: Profile an existing process by PID
@@ -120,18 +135,18 @@ def _run_with_sync(original_cmd):
  
  
  class SampleProfiler:
-    def __init__(self, pid, sample_interval_usec, all_threads):
+    def __init__(self, pid, sample_interval_usec, all_threads, *, mode=PROFILING_MODE_WALL):
          self.pid = pid
          self.sample_interval_usec = sample_interval_usec
          self.all_threads = all_threads
          if _FREE_THREADED_BUILD:
              self.unwinder = _remote_debugging.RemoteUnwinder(
-                self.pid, all_threads=self.all_threads
+                self.pid, all_threads=self.all_threads, mode=mode
              )
          else:
              only_active_threads = bool(self.all_threads)
              self.unwinder = _remote_debugging.RemoteUnwinder(
-                self.pid, only_active_thread=only_active_threads
+                self.pid, only_active_thread=only_active_threads, mode=mode
              )
          # Track sample intervals and total sample count
          self.sample_intervals = deque(maxlen=100)
@@ -596,21 +611,25 @@ def sample(
      show_summary=True,
      output_format="pstats",
      realtime_stats=False,
+    mode=PROFILING_MODE_WALL,
  ):
      profiler = SampleProfiler(
-        pid, sample_interval_usec, all_threads=all_threads
+        pid, sample_interval_usec, all_threads=all_threads, mode=mode
      )
      profiler.realtime_stats = realtime_stats
  
+    # Determine skip_idle for collector compatibility
+    skip_idle = mode != PROFILING_MODE_WALL
+
      collector = None
      match output_format:
          case "pstats":
-            collector = PstatsCollector(sample_interval_usec)
+            collector = PstatsCollector(sample_interval_usec, skip_idle=skip_idle)
          case "collapsed":
-            collector = CollapsedStackCollector()
+            collector = CollapsedStackCollector(skip_idle=skip_idle)
              filename = filename or f"collapsed.{pid}.txt"
          case "flamegraph":
-            collector = FlamegraphCollector()
+            collector = FlamegraphCollector(skip_idle=skip_idle)
              filename = filename or f"flamegraph.{pid}.html"
          case _:
              raise ValueError(f"Invalid output format: {output_format}")
@@ -661,6 +680,8 @@ def wait_for_process_and_sample(pid, sort_value, args):
      if not filename and args.format == "collapsed":
          filename = f"collapsed.{pid}.txt"
  
+    mode = _parse_mode(args.mode)
+
      sample(
          pid,
          sort=sort_value,
@@ -672,6 +693,7 @@ def wait_for_process_and_sample(pid, sort_value, args):
          show_summary=not args.no_summary,
          output_format=args.format,
          realtime_stats=args.realtime_stats,
+        mode=mode,
      )
  
  
@@ -726,6 +748,15 @@ def main():
          help="Print real-time sampling statistics (Hz, mean, min, max, stdev) during profiling",
      )
  
+    # Mode options
+    mode_group = parser.add_argument_group("Mode options")
+    mode_group.add_argument(
+        "--mode",
+        choices=["wall", "cpu", "gil"],
+        default="wall",
+        help="Sampling mode: wall (all threads), cpu (only CPU-running threads), gil (only GIL-holding threads)",
+    )
+
      # Output format selection
      output_group = parser.add_argument_group("Output options")
      output_format = output_group.add_mutually_exclusive_group()
@@ -850,6 +881,8 @@ def main():
      elif target_count > 1:
          parser.error("only one target type can be specified: -p/--pid, -m/--module, or script")
  
+    mode = _parse_mode(args.mode)
+
      if args.pid:
          sample(
              args.pid,
@@ -862,6 +895,7 @@ def main():
              show_summary=not args.no_summary,
              output_format=args.format,
              realtime_stats=args.realtime_stats,
+            mode=mode,
          )
      elif args.module or args.args:
          if args.module:
diff --git a/Lib/profiling/sampling/stack_collector.py b/Lib/profiling/sampling/stack_collector.py

index 0588f822cd54f26efafa66bcdb99b529e8983cf2..6983be70ee04406465c315637392923be4f8570c 100644 (file)
--- a/Lib/profiling/sampling/stack_collector.py
+++ b/Lib/profiling/sampling/stack_collector.py
@@ -11,8 +11,11 @@ from .string_table import StringTable
  
  
  class StackTraceCollector(Collector):
-    def collect(self, stack_frames):
-        for frames in self._iter_all_frames(stack_frames):
+    def __init__(self, *, skip_idle=False):
+        self.skip_idle = skip_idle
+
+    def collect(self, stack_frames, skip_idle=False):
+        for frames in self._iter_all_frames(stack_frames, skip_idle=skip_idle):
              if not frames:
                  continue
              self.process_frames(frames)
@@ -22,7 +25,8 @@ class StackTraceCollector(Collector):
  
  
  class CollapsedStackCollector(StackTraceCollector):
-    def __init__(self):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
          self.stack_counter = collections.Counter()
  
      def process_frames(self, frames):
@@ -46,7 +50,8 @@ class CollapsedStackCollector(StackTraceCollector):
  
  
  class FlamegraphCollector(StackTraceCollector):
-    def __init__(self):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
          self.stats = {}
          self._root = {"samples": 0, "children": {}}
          self._total_samples = 0
diff --git a/Lib/test/test_external_inspection.py b/Lib/test/test_external_inspection.py

index 262e472da7eac3edc82ea0a8417f991eac19a448..2f8f5f0e169339570bb32e1533e2f7b9db9a0df4 100644 (file)
--- a/Lib/test/test_external_inspection.py
+++ b/Lib/test/test_external_inspection.py
@@ -19,6 +19,11 @@ from test.support.socket_helper import find_unused_port
  
  import subprocess
  
+# Profiling mode constants
+PROFILING_MODE_WALL = 0
+PROFILING_MODE_CPU = 1
+PROFILING_MODE_GIL = 2
+
  try:
      from concurrent import interpreters
  except ImportError:
@@ -1670,6 +1675,228 @@ class TestUnsupportedPlatformHandling(unittest.TestCase):
              str(cm.exception)
          )
  
+class TestDetectionOfThreadStatus(unittest.TestCase):
+    @unittest.skipIf(
+        sys.platform not in ("linux", "darwin", "win32"),
+        "Test only runs on unsupported platforms (not Linux, macOS, or Windows)",
+    )
+    @unittest.skipIf(sys.platform == "android", "Android raises Linux-specific exception")
+    def test_thread_status_detection(self):
+        port = find_unused_port()
+        script = textwrap.dedent(
+            f"""\
+            import time, sys, socket, threading
+            import os
+
+            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            sock.connect(('localhost', {port}))
+
+            def sleeper():
+                tid = threading.get_native_id()
+                sock.sendall(f'ready:sleeper:{{tid}}\\n'.encode())
+                time.sleep(10000)
+
+            def busy():
+                tid = threading.get_native_id()
+                sock.sendall(f'ready:busy:{{tid}}\\n'.encode())
+                x = 0
+                while True:
+                    x = x + 1
+                time.sleep(0.5)
+
+            t1 = threading.Thread(target=sleeper)
+            t2 = threading.Thread(target=busy)
+            t1.start()
+            t2.start()
+            sock.sendall(b'ready:main\\n')
+            t1.join()
+            t2.join()
+            sock.close()
+            """
+        )
+        with os_helper.temp_dir() as work_dir:
+            script_dir = os.path.join(work_dir, "script_pkg")
+            os.mkdir(script_dir)
+            server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+            server_socket.bind(("localhost", port))
+            server_socket.settimeout(SHORT_TIMEOUT)
+            server_socket.listen(1)
+
+            script_name = _make_test_script(script_dir, "thread_status_script", script)
+            client_socket = None
+            try:
+                p = subprocess.Popen([sys.executable, script_name])
+                client_socket, _ = server_socket.accept()
+                server_socket.close()
+                response = b""
+                sleeper_tid = None
+                busy_tid = None
+                while True:
+                    chunk = client_socket.recv(1024)
+                    response += chunk
+                    if b"ready:main" in response and b"ready:sleeper" in response and b"ready:busy" in response:
+                        # Parse TIDs from the response
+                        for line in response.split(b"\n"):
+                            if line.startswith(b"ready:sleeper:"):
+                                try:
+                                    sleeper_tid = int(line.split(b":")[-1])
+                                except Exception:
+                                    pass
+                            elif line.startswith(b"ready:busy:"):
+                                try:
+                                    busy_tid = int(line.split(b":")[-1])
+                                except Exception:
+                                    pass
+                        break
+
+                attempts = 10
+                try:
+                    unwinder = RemoteUnwinder(p.pid, all_threads=True, mode=PROFILING_MODE_CPU,
+                                                skip_non_matching_threads=False)
+                    for _ in range(attempts):
+                        traces = unwinder.get_stack_trace()
+                        # Check if any thread is running
+                        if any(thread_info.status == 0 for interpreter_info in traces
+                               for thread_info in interpreter_info.threads):
+                            break
+                        time.sleep(0.5)  # Give a bit of time to let threads settle
+                except PermissionError:
+                    self.skipTest(
+                        "Insufficient permissions to read the stack trace"
+                    )
+
+
+                # Find threads and their statuses
+                statuses = {}
+                for interpreter_info in traces:
+                    for thread_info in interpreter_info.threads:
+                        statuses[thread_info.thread_id] = thread_info.status
+
+                self.assertIsNotNone(sleeper_tid, "Sleeper thread id not received")
+                self.assertIsNotNone(busy_tid, "Busy thread id not received")
+                self.assertIn(sleeper_tid, statuses, "Sleeper tid not found in sampled threads")
+                self.assertIn(busy_tid, statuses, "Busy tid not found in sampled threads")
+                self.assertEqual(statuses[sleeper_tid], 1, "Sleeper thread should be idle (1)")
+                self.assertEqual(statuses[busy_tid], 0, "Busy thread should be running (0)")
+
+            finally:
+                if client_socket is not None:
+                    client_socket.close()
+                p.terminate()
+                p.wait(timeout=SHORT_TIMEOUT)
+
+    @unittest.skipIf(
+        sys.platform not in ("linux", "darwin", "win32"),
+        "Test only runs on unsupported platforms (not Linux, macOS, or Windows)",
+    )
+    @unittest.skipIf(sys.platform == "android", "Android raises Linux-specific exception")
+    def test_thread_status_gil_detection(self):
+        port = find_unused_port()
+        script = textwrap.dedent(
+            f"""\
+            import time, sys, socket, threading
+            import os
+
+            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            sock.connect(('localhost', {port}))
+
+            def sleeper():
+                tid = threading.get_native_id()
+                sock.sendall(f'ready:sleeper:{{tid}}\\n'.encode())
+                time.sleep(10000)
+
+            def busy():
+                tid = threading.get_native_id()
+                sock.sendall(f'ready:busy:{{tid}}\\n'.encode())
+                x = 0
+                while True:
+                    x = x + 1
+                time.sleep(0.5)
+
+            t1 = threading.Thread(target=sleeper)
+            t2 = threading.Thread(target=busy)
+            t1.start()
+            t2.start()
+            sock.sendall(b'ready:main\\n')
+            t1.join()
+            t2.join()
+            sock.close()
+            """
+        )
+        with os_helper.temp_dir() as work_dir:
+            script_dir = os.path.join(work_dir, "script_pkg")
+            os.mkdir(script_dir)
+            server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+            server_socket.bind(("localhost", port))
+            server_socket.settimeout(SHORT_TIMEOUT)
+            server_socket.listen(1)
+
+            script_name = _make_test_script(script_dir, "thread_status_script", script)
+            client_socket = None
+            try:
+                p = subprocess.Popen([sys.executable, script_name])
+                client_socket, _ = server_socket.accept()
+                server_socket.close()
+                response = b""
+                sleeper_tid = None
+                busy_tid = None
+                while True:
+                    chunk = client_socket.recv(1024)
+                    response += chunk
+                    if b"ready:main" in response and b"ready:sleeper" in response and b"ready:busy" in response:
+                        # Parse TIDs from the response
+                        for line in response.split(b"\n"):
+                            if line.startswith(b"ready:sleeper:"):
+                                try:
+                                    sleeper_tid = int(line.split(b":")[-1])
+                                except Exception:
+                                    pass
+                            elif line.startswith(b"ready:busy:"):
+                                try:
+                                    busy_tid = int(line.split(b":")[-1])
+                                except Exception:
+                                    pass
+                        break
+
+                attempts = 10
+                try:
+                    unwinder = RemoteUnwinder(p.pid, all_threads=True, mode=PROFILING_MODE_GIL,
+                                                skip_non_matching_threads=False)
+                    for _ in range(attempts):
+                        traces = unwinder.get_stack_trace()
+                        # Check if any thread is running
+                        if any(thread_info.status == 0 for interpreter_info in traces
+                               for thread_info in interpreter_info.threads):
+                            break
+                        time.sleep(0.5)  # Give a bit of time to let threads settle
+                except PermissionError:
+                    self.skipTest(
+                        "Insufficient permissions to read the stack trace"
+                    )
+
+
+                # Find threads and their statuses
+                statuses = {}
+                for interpreter_info in traces:
+                    for thread_info in interpreter_info.threads:
+                        statuses[thread_info.thread_id] = thread_info.status
+
+                self.assertIsNotNone(sleeper_tid, "Sleeper thread id not received")
+                self.assertIsNotNone(busy_tid, "Busy thread id not received")
+                self.assertIn(sleeper_tid, statuses, "Sleeper tid not found in sampled threads")
+                self.assertIn(busy_tid, statuses, "Busy tid not found in sampled threads")
+                self.assertEqual(statuses[sleeper_tid], 2, "Sleeper thread should be idle (1)")
+                self.assertEqual(statuses[busy_tid], 0, "Busy thread should be running (0)")
+
+            finally:
+                if client_socket is not None:
+                    client_socket.close()
+                p.terminate()
+                p.wait(timeout=SHORT_TIMEOUT)
+
+
  
  if __name__ == "__main__":
      unittest.main()
diff --git a/Lib/test/test_profiling/test_sampling_profiler.py b/Lib/test/test_profiling/test_sampling_profiler.py

index a6ca0fea0d46e4306ae9b4c263e07ced2aa29102..fd6b1862230288577f06e1603167ffc34816a1a4 100644 (file)
--- a/Lib/test/test_profiling/test_sampling_profiler.py
+++ b/Lib/test/test_profiling/test_sampling_profiler.py
@@ -1999,6 +1999,7 @@ class TestSampleProfilerCLI(unittest.TestCase):
                  show_summary=True,
                  output_format="pstats",
                  realtime_stats=False,
+                mode=0
              )
  
      @unittest.skipIf(is_emscripten, "socket.SO_REUSEADDR does not exist")
@@ -2026,6 +2027,7 @@ class TestSampleProfilerCLI(unittest.TestCase):
                  show_summary=True,
                  output_format="pstats",
                  realtime_stats=False,
+                mode=0
              )
  
      @unittest.skipIf(is_emscripten, "socket.SO_REUSEADDR does not exist")
@@ -2053,6 +2055,7 @@ class TestSampleProfilerCLI(unittest.TestCase):
                  show_summary=True,
                  output_format="pstats",
                  realtime_stats=False,
+                mode=0
              )
  
      @unittest.skipIf(is_emscripten, "socket.SO_REUSEADDR does not exist")
@@ -2152,6 +2155,7 @@ class TestSampleProfilerCLI(unittest.TestCase):
                  show_summary=True,
                  output_format="pstats",
                  realtime_stats=False,
+                mode=0
              )
  
      @unittest.skipIf(is_emscripten, "socket.SO_REUSEADDR does not exist")
@@ -2185,6 +2189,7 @@ class TestSampleProfilerCLI(unittest.TestCase):
                  show_summary=True,
                  output_format="collapsed",
                  realtime_stats=False,
+                mode=0
              )
  
      def test_cli_empty_module_name(self):
@@ -2396,6 +2401,7 @@ class TestSampleProfilerCLI(unittest.TestCase):
                  show_summary=True,
                  output_format="pstats",
                  realtime_stats=False,
+                mode=0
              )
  
      def test_sort_options(self):
@@ -2426,5 +2432,360 @@ class TestSampleProfilerCLI(unittest.TestCase):
                  mock_sample.reset_mock()
  
  
+class TestCpuModeFiltering(unittest.TestCase):
+    """Test CPU mode filtering functionality (--mode=cpu)."""
+
+    def test_mode_validation(self):
+        """Test that CLI validates mode choices correctly."""
+        # Invalid mode choice should raise SystemExit
+        test_args = ["profiling.sampling.sample", "--mode", "invalid", "-p", "12345"]
+
+        with (
+            mock.patch("sys.argv", test_args),
+            mock.patch("sys.stderr", io.StringIO()) as mock_stderr,
+            self.assertRaises(SystemExit) as cm,
+        ):
+            profiling.sampling.sample.main()
+
+        self.assertEqual(cm.exception.code, 2)  # argparse error
+        error_msg = mock_stderr.getvalue()
+        self.assertIn("invalid choice", error_msg)
+
+    def test_frames_filtered_with_skip_idle(self):
+        """Test that frames are actually filtered when skip_idle=True."""
+        # Create mock frames with different thread statuses
+        class MockThreadInfoWithStatus:
+            def __init__(self, thread_id, frame_info, status):
+                self.thread_id = thread_id
+                self.frame_info = frame_info
+                self.status = status
+
+        # Create test data: running thread, idle thread, and another running thread
+        test_frames = [
+            MockInterpreterInfo(0, [
+                MockThreadInfoWithStatus(1, [MockFrameInfo("active1.py", 10, "active_func1")], 0),  # RUNNING
+                MockThreadInfoWithStatus(2, [MockFrameInfo("idle.py", 20, "idle_func")], 1),        # IDLE
+                MockThreadInfoWithStatus(3, [MockFrameInfo("active2.py", 30, "active_func2")], 0),  # RUNNING
+            ])
+        ]
+
+        # Test with skip_idle=True - should only process running threads
+        collector_skip = PstatsCollector(sample_interval_usec=1000, skip_idle=True)
+        collector_skip.collect(test_frames)
+
+        # Should only have functions from running threads (status 0)
+        active1_key = ("active1.py", 10, "active_func1")
+        active2_key = ("active2.py", 30, "active_func2")
+        idle_key = ("idle.py", 20, "idle_func")
+
+        self.assertIn(active1_key, collector_skip.result)
+        self.assertIn(active2_key, collector_skip.result)
+        self.assertNotIn(idle_key, collector_skip.result)  # Idle thread should be filtered out
+
+        # Test with skip_idle=False - should process all threads
+        collector_no_skip = PstatsCollector(sample_interval_usec=1000, skip_idle=False)
+        collector_no_skip.collect(test_frames)
+
+        # Should have functions from all threads
+        self.assertIn(active1_key, collector_no_skip.result)
+        self.assertIn(active2_key, collector_no_skip.result)
+        self.assertIn(idle_key, collector_no_skip.result)  # Idle thread should be included
+
+    @requires_subprocess()
+    def test_cpu_mode_integration_filtering(self):
+        """Integration test: CPU mode should only capture active threads, not idle ones."""
+        # Script with one mostly-idle thread and one CPU-active thread
+        cpu_vs_idle_script = '''
+import time
+import threading
+
+def idle_worker():
+    time.sleep(999999)
+
+def cpu_active_worker():
+    x = 1
+    while True:
+        x += 1
+
+def main():
+# Start both threads
+    idle_thread = threading.Thread(target=idle_worker)
+    cpu_thread = threading.Thread(target=cpu_active_worker)
+    idle_thread.start()
+    cpu_thread.start()
+    idle_thread.join()
+    cpu_thread.join()
+
+main()
+
+'''
+        with test_subprocess(cpu_vs_idle_script) as proc:
+            with (
+                io.StringIO() as captured_output,
+                mock.patch("sys.stdout", captured_output),
+            ):
+                try:
+                    profiling.sampling.sample.sample(
+                        proc.pid,
+                        duration_sec=0.5,
+                        sample_interval_usec=5000,
+                        mode=1,  # CPU mode
+                        show_summary=False,
+                        all_threads=True,
+                    )
+                except (PermissionError, RuntimeError) as e:
+                    self.skipTest("Insufficient permissions for remote profiling")
+
+                cpu_mode_output = captured_output.getvalue()
+
+            # Test wall-clock mode (mode=0) - should capture both functions
+            with (
+                io.StringIO() as captured_output,
+                mock.patch("sys.stdout", captured_output),
+            ):
+                try:
+                    profiling.sampling.sample.sample(
+                        proc.pid,
+                        duration_sec=0.5,
+                        sample_interval_usec=5000,
+                        mode=0,  # Wall-clock mode
+                        show_summary=False,
+                        all_threads=True,
+                    )
+                except (PermissionError, RuntimeError) as e:
+                    self.skipTest("Insufficient permissions for remote profiling")
+
+                wall_mode_output = captured_output.getvalue()
+
+            # Verify both modes captured samples
+            self.assertIn("Captured", cpu_mode_output)
+            self.assertIn("samples", cpu_mode_output)
+            self.assertIn("Captured", wall_mode_output)
+            self.assertIn("samples", wall_mode_output)
+
+            # CPU mode should strongly favor cpu_active_worker over mostly_idle_worker
+            self.assertIn("cpu_active_worker", cpu_mode_output)
+            self.assertNotIn("idle_worker", cpu_mode_output)
+
+            # Wall-clock mode should capture both types of work
+            self.assertIn("cpu_active_worker", wall_mode_output)
+            self.assertIn("idle_worker", wall_mode_output)
+
+
+class TestGilModeFiltering(unittest.TestCase):
+    """Test GIL mode filtering functionality (--mode=gil)."""
+
+    def test_gil_mode_validation(self):
+        """Test that CLI accepts gil mode choice correctly."""
+        test_args = ["profiling.sampling.sample", "--mode", "gil", "-p", "12345"]
+
+        with (
+            mock.patch("sys.argv", test_args),
+            mock.patch("profiling.sampling.sample.sample") as mock_sample,
+        ):
+            try:
+                profiling.sampling.sample.main()
+            except SystemExit:
+                pass  # Expected due to invalid PID
+
+        # Should have attempted to call sample with mode=2 (GIL mode)
+        mock_sample.assert_called_once()
+        call_args = mock_sample.call_args[1]
+        self.assertEqual(call_args["mode"], 2)  # PROFILING_MODE_GIL
+
+    def test_gil_mode_sample_function_call(self):
+        """Test that sample() function correctly uses GIL mode."""
+        with (
+            mock.patch("profiling.sampling.sample.SampleProfiler") as mock_profiler,
+            mock.patch("profiling.sampling.sample.PstatsCollector") as mock_collector,
+        ):
+            # Mock the profiler instance
+            mock_instance = mock.Mock()
+            mock_profiler.return_value = mock_instance
+
+            # Mock the collector instance
+            mock_collector_instance = mock.Mock()
+            mock_collector.return_value = mock_collector_instance
+
+            # Call sample with GIL mode and a filename to avoid pstats creation
+            profiling.sampling.sample.sample(
+                12345,
+                mode=2,  # PROFILING_MODE_GIL
+                duration_sec=1,
+                sample_interval_usec=1000,
+                filename="test_output.txt",
+            )
+
+            # Verify SampleProfiler was created with correct mode
+            mock_profiler.assert_called_once()
+            call_args = mock_profiler.call_args
+            self.assertEqual(call_args[1]['mode'], 2)  # mode parameter
+
+            # Verify profiler.sample was called
+            mock_instance.sample.assert_called_once()
+
+            # Verify collector.export was called since we provided a filename
+            mock_collector_instance.export.assert_called_once_with("test_output.txt")
+
+    def test_gil_mode_collector_configuration(self):
+        """Test that collectors are configured correctly for GIL mode."""
+        with (
+            mock.patch("profiling.sampling.sample.SampleProfiler") as mock_profiler,
+            mock.patch("profiling.sampling.sample.PstatsCollector") as mock_collector,
+        ):
+            # Mock the profiler instance
+            mock_instance = mock.Mock()
+            mock_profiler.return_value = mock_instance
+
+            # Call sample with GIL mode
+            profiling.sampling.sample.sample(
+                12345,
+                mode=2,  # PROFILING_MODE_GIL
+                output_format="pstats",
+            )
+
+            # Verify collector was created with skip_idle=True (since mode != WALL)
+            mock_collector.assert_called_once()
+            call_args = mock_collector.call_args[1]
+            self.assertTrue(call_args['skip_idle'])
+
+    def test_gil_mode_with_collapsed_format(self):
+        """Test GIL mode with collapsed stack format."""
+        with (
+            mock.patch("profiling.sampling.sample.SampleProfiler") as mock_profiler,
+            mock.patch("profiling.sampling.sample.CollapsedStackCollector") as mock_collector,
+        ):
+            # Mock the profiler instance
+            mock_instance = mock.Mock()
+            mock_profiler.return_value = mock_instance
+
+            # Call sample with GIL mode and collapsed format
+            profiling.sampling.sample.sample(
+                12345,
+                mode=2,  # PROFILING_MODE_GIL
+                output_format="collapsed",
+                filename="test_output.txt",
+            )
+
+            # Verify collector was created with skip_idle=True
+            mock_collector.assert_called_once()
+            call_args = mock_collector.call_args[1]
+            self.assertTrue(call_args['skip_idle'])
+
+    def test_gil_mode_cli_argument_parsing(self):
+        """Test CLI argument parsing for GIL mode with various options."""
+        test_args = [
+            "profiling.sampling.sample",
+            "--mode", "gil",
+            "--interval", "500",
+            "--duration", "5",
+            "-p", "12345"
+        ]
+
+        with (
+            mock.patch("sys.argv", test_args),
+            mock.patch("profiling.sampling.sample.sample") as mock_sample,
+        ):
+            try:
+                profiling.sampling.sample.main()
+            except SystemExit:
+                pass  # Expected due to invalid PID
+
+        # Verify all arguments were parsed correctly
+        mock_sample.assert_called_once()
+        call_args = mock_sample.call_args[1]
+        self.assertEqual(call_args["mode"], 2)  # GIL mode
+        self.assertEqual(call_args["sample_interval_usec"], 500)
+        self.assertEqual(call_args["duration_sec"], 5)
+
+    @requires_subprocess()
+    def test_gil_mode_integration_behavior(self):
+        """Integration test: GIL mode should capture GIL-holding threads."""
+        # Create a test script with GIL-releasing operations
+        gil_test_script = '''
+import time
+import threading
+
+def gil_releasing_work():
+    time.sleep(999999)
+
+def gil_holding_work():
+    x = 1
+    while True:
+        x += 1
+
+def main():
+# Start both threads
+    idle_thread = threading.Thread(target=gil_releasing_work)
+    cpu_thread = threading.Thread(target=gil_holding_work)
+    idle_thread.start()
+    cpu_thread.start()
+    idle_thread.join()
+    cpu_thread.join()
+
+main()
+'''
+        with test_subprocess(gil_test_script) as proc:
+            with (
+                io.StringIO() as captured_output,
+                mock.patch("sys.stdout", captured_output),
+            ):
+                try:
+                    profiling.sampling.sample.sample(
+                        proc.pid,
+                        duration_sec=0.5,
+                        sample_interval_usec=5000,
+                        mode=2,  # GIL mode
+                        show_summary=False,
+                        all_threads=True,
+                    )
+                except (PermissionError, RuntimeError) as e:
+                    self.skipTest("Insufficient permissions for remote profiling")
+
+                gil_mode_output = captured_output.getvalue()
+
+            # Test wall-clock mode for comparison
+            with (
+                io.StringIO() as captured_output,
+                mock.patch("sys.stdout", captured_output),
+            ):
+                try:
+                    profiling.sampling.sample.sample(
+                        proc.pid,
+                        duration_sec=0.5,
+                        sample_interval_usec=5000,
+                        mode=0,  # Wall-clock mode
+                        show_summary=False,
+                        all_threads=True,
+                    )
+                except (PermissionError, RuntimeError) as e:
+                    self.skipTest("Insufficient permissions for remote profiling")
+
+                wall_mode_output = captured_output.getvalue()
+
+            # GIL mode should primarily capture GIL-holding work
+            # (Note: actual behavior depends on threading implementation)
+            self.assertIn("gil_holding_work", gil_mode_output)
+
+            # Wall-clock mode should capture both types of work
+            self.assertIn("gil_holding_work", wall_mode_output)
+
+    def test_mode_constants_are_defined(self):
+        """Test that all profiling mode constants are properly defined."""
+        self.assertEqual(profiling.sampling.sample.PROFILING_MODE_WALL, 0)
+        self.assertEqual(profiling.sampling.sample.PROFILING_MODE_CPU, 1)
+        self.assertEqual(profiling.sampling.sample.PROFILING_MODE_GIL, 2)
+
+    def test_parse_mode_function(self):
+        """Test the _parse_mode function with all valid modes."""
+        self.assertEqual(profiling.sampling.sample._parse_mode("wall"), 0)
+        self.assertEqual(profiling.sampling.sample._parse_mode("cpu"), 1)
+        self.assertEqual(profiling.sampling.sample._parse_mode("gil"), 2)
+
+        # Test invalid mode raises KeyError
+        with self.assertRaises(KeyError):
+            profiling.sampling.sample._parse_mode("invalid")
+
+
  if __name__ == "__main__":
      unittest.main()
diff --git a/Modules/_remote_debugging_module.c b/Modules/_remote_debugging_module.c

index c306143ee73b18da4387928a0530037ebd2693d7..701f4b0eabdb158188b1dcb760cc7e753e8f372c 100644 (file)
--- a/Modules/_remote_debugging_module.c
+++ b/Modules/_remote_debugging_module.c
@@ -34,6 +34,31 @@
  #    define HAVE_PROCESS_VM_READV 0
  #endif
  
+// Returns thread status using proc_pidinfo, caches thread_id_offset on first use (macOS only)
+#if defined(__APPLE__) && TARGET_OS_OSX
+#include <libproc.h>
+#include <sys/types.h>
+#define MAX_NATIVE_THREADS 4096
+#endif
+
+#ifdef MS_WINDOWS
+#include <windows.h>
+#include <winternl.h>
+// ntstatus.h conflicts with windows.h so we have to define the NTSTATUS values we need
+#define STATUS_SUCCESS                   ((NTSTATUS)0x00000000L)
+#define STATUS_INFO_LENGTH_MISMATCH      ((NTSTATUS)0xC0000004L)
+typedef enum _WIN32_THREADSTATE {
+    WIN32_THREADSTATE_INITIALIZED = 0,   // Recognized by the kernel
+    WIN32_THREADSTATE_READY       = 1,   // Prepared to run on the next available processor
+    WIN32_THREADSTATE_RUNNING     = 2,   // Currently executing
+    WIN32_THREADSTATE_STANDBY     = 3,   // About to run, only one thread may be in this state at a time
+    WIN32_THREADSTATE_TERMINATED  = 4,   // Finished executing
+    WIN32_THREADSTATE_WAITING     = 5,   // Not ready for the processor, when ready, it will be rescheduled
+    WIN32_THREADSTATE_TRANSITION  = 6,   // Waiting for resources other than the processor
+    WIN32_THREADSTATE_UNKNOWN     = 7    // Thread state is unknown
+} WIN32_THREADSTATE;
+#endif
+
  /* ============================================================================
   * TYPE DEFINITIONS AND STRUCTURES
   * ============================================================================ */
@@ -153,6 +178,7 @@ static PyStructSequence_Desc CoroInfo_desc = {
  // ThreadInfo structseq type - replaces 2-tuple (thread_id, frame_info)
  static PyStructSequence_Field ThreadInfo_fields[] = {
      {"thread_id", "Thread ID"},
+    {"status", "Thread status"},
      {"frame_info", "Frame information"},
      {NULL}
  };
@@ -211,6 +237,19 @@ typedef struct {
      PyTypeObject *AwaitedInfo_Type;
  } RemoteDebuggingState;
  
+enum _ThreadState {
+    THREAD_STATE_RUNNING,
+    THREAD_STATE_IDLE,
+    THREAD_STATE_GIL_WAIT,
+    THREAD_STATE_UNKNOWN
+};
+
+enum _ProfilingMode {
+    PROFILING_MODE_WALL = 0,
+    PROFILING_MODE_CPU = 1,
+    PROFILING_MODE_GIL = 2
+};
+
  typedef struct {
      PyObject_HEAD
      proc_handle_t handle;
@@ -224,12 +263,21 @@ typedef struct {
      _Py_hashtable_t *code_object_cache;
      int debug;
      int only_active_thread;
+    int mode;  // Use enum _ProfilingMode values
+    int skip_non_matching_threads;  // New option to skip threads that don't match mode
      RemoteDebuggingState *cached_state;  // Cached module state
  #ifdef Py_GIL_DISABLED
      // TLBC cache invalidation tracking
      uint32_t tlbc_generation;  // Track TLBC index pool changes
      _Py_hashtable_t *tlbc_cache;  // Cache of TLBC arrays by code object address
  #endif
+#ifdef __APPLE__
+    uint64_t thread_id_offset;
+#endif
+#ifdef MS_WINDOWS
+    PVOID win_process_buffer;
+    ULONG win_process_buffer_size;
+#endif
  } RemoteUnwinderObject;
  
  #define RemoteUnwinder_CAST(op) ((RemoteUnwinderObject *)(op))
@@ -2453,10 +2501,139 @@ process_frame_chain(
      return 0;
  }
  
+static int
+get_thread_status(RemoteUnwinderObject *unwinder, uint64_t tid, uint64_t pthread_id) {
+#if defined(__APPLE__) && TARGET_OS_OSX
+   if (unwinder->thread_id_offset == 0) {
+        uint64_t *tids = (uint64_t *)PyMem_Malloc(MAX_NATIVE_THREADS * sizeof(uint64_t));
+        if (!tids) {
+            PyErr_NoMemory();
+            return -1;
+        }
+        int n = proc_pidinfo(unwinder->handle.pid, PROC_PIDLISTTHREADS, 0, tids, MAX_NATIVE_THREADS * sizeof(uint64_t)) / sizeof(uint64_t);
+        if (n <= 0) {
+            PyMem_Free(tids);
+            return THREAD_STATE_UNKNOWN;
+        }
+        uint64_t min_offset = UINT64_MAX;
+        for (int i = 0; i < n; i++) {
+            uint64_t offset = tids[i] - pthread_id;
+            if (offset < min_offset) {
+                min_offset = offset;
+            }
+        }
+        unwinder->thread_id_offset = min_offset;
+        PyMem_Free(tids);
+    }
+    struct proc_threadinfo ti;
+    uint64_t tid_with_offset = pthread_id + unwinder->thread_id_offset;
+    if (proc_pidinfo(unwinder->handle.pid, PROC_PIDTHREADINFO, tid_with_offset, &ti, sizeof(ti)) != sizeof(ti)) {
+        return THREAD_STATE_UNKNOWN;
+    }
+    if (ti.pth_run_state == TH_STATE_RUNNING) {
+        return THREAD_STATE_RUNNING;
+    }
+    return THREAD_STATE_IDLE;
+#elif defined(__linux__)
+    char stat_path[256];
+    char buffer[2048] = "";
+
+    snprintf(stat_path, sizeof(stat_path), "/proc/%d/task/%lu/stat", unwinder->handle.pid, tid);
+
+    int fd = open(stat_path, O_RDONLY);
+    if (fd == -1) {
+        return THREAD_STATE_UNKNOWN;
+    }
+
+    if (read(fd, buffer, 2047) == 0) {
+        close(fd);
+        return THREAD_STATE_UNKNOWN;
+    }
+    close(fd);
+
+    char *p = strchr(buffer, ')');
+    if (!p) {
+        return THREAD_STATE_UNKNOWN;
+    }
+
+    p += 2;  // Skip ") "
+    if (*p == ' ') {
+        p++;
+    }
+
+    switch (*p) {
+        case 'R':  // Running
+            return THREAD_STATE_RUNNING;
+        case 'S':  // Interruptible sleep
+        case 'D':  // Uninterruptible sleep
+        case 'T':  // Stopped
+        case 'Z':  // Zombie
+        case 'I':  // Idle kernel thread
+            return THREAD_STATE_IDLE;
+        default:
+            return THREAD_STATE_UNKNOWN;
+    }
+#elif defined(MS_WINDOWS)
+    ULONG n;
+    NTSTATUS status = NtQuerySystemInformation(
+        SystemProcessInformation,
+        unwinder->win_process_buffer,
+        unwinder->win_process_buffer_size,
+        &n
+    );
+    if (status == STATUS_INFO_LENGTH_MISMATCH) {
+        // Buffer was too small so we reallocate a larger one and try again.
+        unwinder->win_process_buffer_size = n;
+        PVOID new_buffer = PyMem_Realloc(unwinder->win_process_buffer, n);
+        if (!new_buffer) {
+            return -1;
+        }
+        unwinder->win_process_buffer = new_buffer;
+        return get_thread_status(unwinder, tid, pthread_id);
+    }
+    if (status != STATUS_SUCCESS) {
+        return -1;
+    }
+
+    SYSTEM_PROCESS_INFORMATION *pi = (SYSTEM_PROCESS_INFORMATION *)unwinder->win_process_buffer;
+    while ((ULONG)(ULONG_PTR)pi->UniqueProcessId != unwinder->handle.pid) {
+        if (pi->NextEntryOffset == 0) {
+            // We didn't find the process
+            return -1;
+        }
+        pi = (SYSTEM_PROCESS_INFORMATION *)(((BYTE *)pi) + pi->NextEntryOffset);
+    }
+
+    SYSTEM_THREAD_INFORMATION *ti = (SYSTEM_THREAD_INFORMATION *)((char *)pi + sizeof(SYSTEM_PROCESS_INFORMATION));
+    for (Py_ssize_t i = 0; i < pi->NumberOfThreads; i++, ti++) {
+        if (ti->ClientId.UniqueThread == (HANDLE)tid) {
+            return ti->ThreadState != WIN32_THREADSTATE_RUNNING ? THREAD_STATE_IDLE : THREAD_STATE_RUNNING;
+        }
+    }
+
+    return -1;
+#else
+    return THREAD_STATE_UNKNOWN;
+#endif
+}
+
+typedef struct {
+    unsigned int initialized:1;
+    unsigned int bound:1;
+    unsigned int unbound:1;
+    unsigned int bound_gilstate:1;
+    unsigned int active:1;
+    unsigned int finalizing:1;
+    unsigned int cleared:1;
+    unsigned int finalized:1;
+    unsigned int :24;
+} _thread_status;
+
  static PyObject*
  unwind_stack_for_thread(
      RemoteUnwinderObject *unwinder,
-    uintptr_t *current_tstate
+    uintptr_t *current_tstate,
+    uintptr_t gil_holder_tstate
  ) {
      PyObject *frame_info = NULL;
      PyObject *thread_id = NULL;
@@ -2471,6 +2648,44 @@ unwind_stack_for_thread(
          goto error;
      }
  
+    long tid = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id);
+
+    // Calculate thread status based on mode
+    int status = THREAD_STATE_UNKNOWN;
+    if (unwinder->mode == PROFILING_MODE_CPU) {
+        long pthread_id = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.thread_id);
+        status = get_thread_status(unwinder, tid, pthread_id);
+        if (status == -1) {
+            PyErr_Print();
+            PyErr_SetString(PyExc_RuntimeError, "Failed to get thread status");
+            goto error;
+        }
+    } else if (unwinder->mode == PROFILING_MODE_GIL) {
+#ifdef Py_GIL_DISABLED
+        // All threads are considered running in free threading builds if they have a thread state attached
+        int active = GET_MEMBER(_thread_status, ts, unwinder->debug_offsets.thread_state.status).active;
+        status = active ? THREAD_STATE_RUNNING : THREAD_STATE_GIL_WAIT;
+#else
+        status = (*current_tstate == gil_holder_tstate) ? THREAD_STATE_RUNNING : THREAD_STATE_GIL_WAIT;
+#endif
+    } else {
+        // PROFILING_MODE_WALL - all threads are considered running
+        status = THREAD_STATE_RUNNING;
+    }
+
+    // Check if we should skip this thread based on mode
+    int should_skip = 0;
+    if (unwinder->skip_non_matching_threads && status != THREAD_STATE_RUNNING &&
+        (unwinder->mode == PROFILING_MODE_CPU || unwinder->mode == PROFILING_MODE_GIL)) {
+        should_skip = 1;
+    }
+
+    if (should_skip) {
+        // Advance to next thread and return NULL to skip processing
+        *current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next);
+        return NULL;
+    }
+
      uintptr_t frame_addr = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.current_frame);
  
      frame_info = PyList_New(0);
@@ -2491,8 +2706,7 @@ unwind_stack_for_thread(
  
      *current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next);
  
-    thread_id = PyLong_FromLongLong(
-        GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id));
+    thread_id = PyLong_FromLongLong(tid);
      if (thread_id == NULL) {
          set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread ID");
          goto error;
@@ -2505,8 +2719,16 @@ unwind_stack_for_thread(
          goto error;
      }
  
-    PyStructSequence_SetItem(result, 0, thread_id);  // Steals reference
-    PyStructSequence_SetItem(result, 1, frame_info); // Steals reference
+    PyObject *py_status = PyLong_FromLong(status);
+    if (py_status == NULL) {
+        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread status");
+        goto error;
+    }
+    PyErr_Print();
+
+    PyStructSequence_SetItem(result, 0, thread_id);
+    PyStructSequence_SetItem(result, 1, py_status);  // Steals reference
+    PyStructSequence_SetItem(result, 2, frame_info); // Steals reference
  
      cleanup_stack_chunks(&chunks);
      return result;
@@ -2537,7 +2759,9 @@ _remote_debugging.RemoteUnwinder.__init__
      *
      all_threads: bool = False
      only_active_thread: bool = False
+    mode: int = 0
      debug: bool = False
+    skip_non_matching_threads: bool = True
  
  Initialize a new RemoteUnwinder object for debugging a remote Python process.
  
@@ -2546,9 +2770,12 @@ Args:
      all_threads: If True, initialize state for all threads in the process.
                  If False, only initialize for the main thread.
      only_active_thread: If True, only sample the thread holding the GIL.
+    mode: Profiling mode: 0=WALL (wall-time), 1=CPU (cpu-time), 2=GIL (gil-time).
                         Cannot be used together with all_threads=True.
      debug: If True, chain exceptions to explain the sequence of events that
             lead to the exception.
+    skip_non_matching_threads: If True, skip threads that don't match the selected mode.
+                              If False, include all threads regardless of mode.
  
  The RemoteUnwinder provides functionality to inspect and debug a running Python
  process, including examining thread states, stack frames and other runtime data.
@@ -2564,8 +2791,9 @@ static int
  _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
                                                 int pid, int all_threads,
                                                 int only_active_thread,
-                                               int debug)
-/*[clinic end generated code: output=13ba77598ecdcbe1 input=cfc21663fbe263c4]*/
+                                               int mode, int debug,
+                                               int skip_non_matching_threads)
+/*[clinic end generated code: output=abf5ea5cd58bcb36 input=08fb6ace023ec3b5]*/
  {
      // Validate that all_threads and only_active_thread are not both True
      if (all_threads && only_active_thread) {
@@ -2584,6 +2812,8 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
  
      self->debug = debug;
      self->only_active_thread = only_active_thread;
+    self->mode = mode;
+    self->skip_non_matching_threads = skip_non_matching_threads;
      self->cached_state = NULL;
      if (_Py_RemoteDebug_InitProcHandle(&self->handle, pid) < 0) {
          set_exception_cause(self, PyExc_RuntimeError, "Failed to initialize process handle");
@@ -2656,6 +2886,15 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
      }
  #endif
  
+#if defined(__APPLE__)
+    self->thread_id_offset = 0;
+#endif
+
+#ifdef MS_WINDOWS
+    self->win_process_buffer = NULL;
+    self->win_process_buffer_size = 0;
+#endif
+
      return 0;
  }
  
@@ -2761,21 +3000,25 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
              goto exit;
          }
  
+        // Get the GIL holder for this interpreter (needed for GIL_WAIT logic)
+        uintptr_t gil_holder_tstate = 0;
+        int gil_locked = GET_MEMBER(int, interp_state_buffer,
+            self->debug_offsets.interpreter_state.gil_runtime_state_locked);
+        if (gil_locked) {
+            gil_holder_tstate = (uintptr_t)GET_MEMBER(PyThreadState*, interp_state_buffer,
+                self->debug_offsets.interpreter_state.gil_runtime_state_holder);
+        }
+
          uintptr_t current_tstate;
          if (self->only_active_thread) {
              // Find the GIL holder for THIS interpreter
-            int gil_locked = GET_MEMBER(int, interp_state_buffer,
-                self->debug_offsets.interpreter_state.gil_runtime_state_locked);
-
              if (!gil_locked) {
                  // This interpreter's GIL is not locked, skip it
                  Py_DECREF(interpreter_threads);
                  goto next_interpreter;
              }
  
-            // Get the GIL holder for this interpreter
-            current_tstate = (uintptr_t)GET_MEMBER(PyThreadState*, interp_state_buffer,
-                self->debug_offsets.interpreter_state.gil_runtime_state_holder);
+            current_tstate = gil_holder_tstate;
          } else if (self->tstate_addr == 0) {
              // Get all threads for this interpreter
              current_tstate = GET_MEMBER(uintptr_t, interp_state_buffer,
@@ -2786,8 +3029,14 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
          }
  
          while (current_tstate != 0) {
-            PyObject* frame_info = unwind_stack_for_thread(self, &current_tstate);
+            PyObject* frame_info = unwind_stack_for_thread(self, &current_tstate, gil_holder_tstate);
              if (!frame_info) {
+                // Check if this was an intentional skip due to mode-based filtering
+                if ((self->mode == PROFILING_MODE_CPU || self->mode == PROFILING_MODE_GIL) && !PyErr_Occurred()) {
+                    // Thread was skipped due to mode filtering, continue to next thread
+                    continue;
+                }
+                // This was an actual error
                  Py_DECREF(interpreter_threads);
                  set_exception_cause(self, PyExc_RuntimeError, "Failed to unwind stack for thread");
                  Py_CLEAR(result);
@@ -3038,6 +3287,12 @@ RemoteUnwinder_dealloc(PyObject *op)
      if (self->code_object_cache) {
          _Py_hashtable_destroy(self->code_object_cache);
      }
+#ifdef MS_WINDOWS
+    if(self->win_process_buffer != NULL) {
+        PyMem_Free(self->win_process_buffer);
+    }
+#endif
+
  #ifdef Py_GIL_DISABLED
      if (self->tlbc_cache) {
          _Py_hashtable_destroy(self->tlbc_cache);
diff --git a/Modules/clinic/_remote_debugging_module.c.h b/Modules/clinic/_remote_debugging_module.c.h

index 9bfcdde407fe3cbe4f651bef35c9f006e3e1c27b..7dd54e3124887bc6c9f1c619f51feb4f337778c4 100644 (file)
--- a/Modules/clinic/_remote_debugging_module.c.h
+++ b/Modules/clinic/_remote_debugging_module.c.h
@@ -11,7 +11,7 @@ preserve
  
  PyDoc_STRVAR(_remote_debugging_RemoteUnwinder___init____doc__,
  "RemoteUnwinder(pid, *, all_threads=False, only_active_thread=False,\n"
-"               debug=False)\n"
+"               mode=0, debug=False, skip_non_matching_threads=True)\n"
  "--\n"
  "\n"
  "Initialize a new RemoteUnwinder object for debugging a remote Python process.\n"
@@ -21,9 +21,12 @@ PyDoc_STRVAR(_remote_debugging_RemoteUnwinder___init____doc__,
  "    all_threads: If True, initialize state for all threads in the process.\n"
  "                If False, only initialize for the main thread.\n"
  "    only_active_thread: If True, only sample the thread holding the GIL.\n"
+"    mode: Profiling mode: 0=WALL (wall-time), 1=CPU (cpu-time), 2=GIL (gil-time).\n"
  "                       Cannot be used together with all_threads=True.\n"
  "    debug: If True, chain exceptions to explain the sequence of events that\n"
  "           lead to the exception.\n"
+"    skip_non_matching_threads: If True, skip threads that don\'t match the selected mode.\n"
+"                              If False, include all threads regardless of mode.\n"
  "\n"
  "The RemoteUnwinder provides functionality to inspect and debug a running Python\n"
  "process, including examining thread states, stack frames and other runtime data.\n"
@@ -38,7 +41,8 @@ static int
  _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
                                                 int pid, int all_threads,
                                                 int only_active_thread,
-                                               int debug);
+                                               int mode, int debug,
+                                               int skip_non_matching_threads);
  
  static int
  _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObject *kwargs)
@@ -46,7 +50,7 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje
      int return_value = -1;
      #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
  
-    #define NUM_KEYWORDS 4
+    #define NUM_KEYWORDS 6
      static struct {
          PyGC_Head _this_is_not_used;
          PyObject_VAR_HEAD
@@ -55,7 +59,7 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje
      } _kwtuple = {
          .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS)
          .ob_hash = -1,
-        .ob_item = { &_Py_ID(pid), &_Py_ID(all_threads), &_Py_ID(only_active_thread), &_Py_ID(debug), },
+        .ob_item = { &_Py_ID(pid), &_Py_ID(all_threads), &_Py_ID(only_active_thread), &_Py_ID(mode), &_Py_ID(debug), &_Py_ID(skip_non_matching_threads), },
      };
      #undef NUM_KEYWORDS
      #define KWTUPLE (&_kwtuple.ob_base.ob_base)
@@ -64,21 +68,23 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje
      #  define KWTUPLE NULL
      #endif  // !Py_BUILD_CORE
  
-    static const char * const _keywords[] = {"pid", "all_threads", "only_active_thread", "debug", NULL};
+    static const char * const _keywords[] = {"pid", "all_threads", "only_active_thread", "mode", "debug", "skip_non_matching_threads", NULL};
      static _PyArg_Parser _parser = {
          .keywords = _keywords,
          .fname = "RemoteUnwinder",
          .kwtuple = KWTUPLE,
      };
      #undef KWTUPLE
-    PyObject *argsbuf[4];
+    PyObject *argsbuf[6];
      PyObject * const *fastargs;
      Py_ssize_t nargs = PyTuple_GET_SIZE(args);
      Py_ssize_t noptargs = nargs + (kwargs ? PyDict_GET_SIZE(kwargs) : 0) - 1;
      int pid;
      int all_threads = 0;
      int only_active_thread = 0;
+    int mode = 0;
      int debug = 0;
+    int skip_non_matching_threads = 1;
  
      fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser,
              /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf);
@@ -110,12 +116,30 @@ _remote_debugging_RemoteUnwinder___init__(PyObject *self, PyObject *args, PyObje
              goto skip_optional_kwonly;
          }
      }
-    debug = PyObject_IsTrue(fastargs[3]);
-    if (debug < 0) {
+    if (fastargs[3]) {
+        mode = PyLong_AsInt(fastargs[3]);
+        if (mode == -1 && PyErr_Occurred()) {
+            goto exit;
+        }
+        if (!--noptargs) {
+            goto skip_optional_kwonly;
+        }
+    }
+    if (fastargs[4]) {
+        debug = PyObject_IsTrue(fastargs[4]);
+        if (debug < 0) {
+            goto exit;
+        }
+        if (!--noptargs) {
+            goto skip_optional_kwonly;
+        }
+    }
+    skip_non_matching_threads = PyObject_IsTrue(fastargs[5]);
+    if (skip_non_matching_threads < 0) {
          goto exit;
      }
  skip_optional_kwonly:
-    return_value = _remote_debugging_RemoteUnwinder___init___impl((RemoteUnwinderObject *)self, pid, all_threads, only_active_thread, debug);
+    return_value = _remote_debugging_RemoteUnwinder___init___impl((RemoteUnwinderObject *)self, pid, all_threads, only_active_thread, mode, debug, skip_non_matching_threads);
  
  exit:
      return return_value;
@@ -297,4 +321,4 @@ _remote_debugging_RemoteUnwinder_get_async_stack_trace(PyObject *self, PyObject
  
      return return_value;
  }
-/*[clinic end generated code: output=2ba15411abf82c33 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=2caefeddf7683d32 input=a9049054013a1b77]*/
diff --git a/PCbuild/_remote_debugging.vcxproj b/PCbuild/_remote_debugging.vcxproj

index c55f2908e03d33761a6b785f8188c6318bc6dd81..a01905fdf2f43715783a7bb40d7d07585eb99ad5 100644 (file)
--- a/PCbuild/_remote_debugging.vcxproj
+++ b/PCbuild/_remote_debugging.vcxproj
@@ -92,6 +92,11 @@
    <PropertyGroup>
      <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
    </PropertyGroup>
+  <ItemDefinitionGroup>
+  <Link>
+    <AdditionalDependencies>ntdll.lib;%(AdditionalDependencies)</AdditionalDependencies>
+  </Link>
+</ItemDefinitionGroup>
    <ItemGroup>
      <ClCompile Include="..\Modules\_remote_debugging_module.c" />
    </ItemGroup>
author	Pablo Galindo Salgado <Pablogsal@gmail.com>
	Fri, 19 Sep 2025 18:17:28 +0000 (19:17 +0100)
committer	GitHub <noreply@github.com>
	Fri, 19 Sep 2025 18:17:28 +0000 (19:17 +0100)
Include/internal/pycore_global_objects_fini_generated.h		patch \| blob \| blame \| history
Include/internal/pycore_global_strings.h		patch \| blob \| blame \| history
Include/internal/pycore_runtime_init_generated.h		patch \| blob \| blame \| history
Include/internal/pycore_unicodeobject_generated.h		patch \| blob \| blame \| history
Lib/profiling/sampling/collector.py		patch \| blob \| blame \| history
Lib/profiling/sampling/pstats_collector.py		patch \| blob \| blame \| history
Lib/profiling/sampling/sample.py		patch \| blob \| blame \| history
Lib/profiling/sampling/stack_collector.py		patch \| blob \| blame \| history
Lib/test/test_external_inspection.py		patch \| blob \| blame \| history
Lib/test/test_profiling/test_sampling_profiler.py		patch \| blob \| blame \| history
Modules/_remote_debugging_module.c		patch \| blob \| blame \| history
Modules/clinic/_remote_debugging_module.c.h		patch \| blob \| blame \| history
PCbuild/_remote_debugging.vcxproj		patch \| blob \| blame \| history