gh-150662: Stop unbounded memory growth in Tachyon `--gecko` collector (#150845)

author Maurycy Pawłowski-Wieroński <maurycy@maurycy.com>

Sat, 6 Jun 2026 02:27:41 +0000 (04:27 +0200)

committer GitHub <noreply@github.com>

Sat, 6 Jun 2026 02:27:41 +0000 (02:27 +0000)
author Maurycy Pawłowski-Wieroński <maurycy@maurycy.com>
Sat, 6 Jun 2026 02:27:41 +0000 (04:27 +0200)
committer GitHub <noreply@github.com>
Sat, 6 Jun 2026 02:27:41 +0000 (02:27 +0000)
diff --git a/Lib/profiling/sampling/gecko_collector.py b/Lib/profiling/sampling/gecko_collector.py

index 54392af95000082932f0f8ec23b1d97474294bab..361f6037f216fdc7444ce763299aad1c9cd9514d 100644 (file)
--- a/Lib/profiling/sampling/gecko_collector.py
+++ b/Lib/profiling/sampling/gecko_collector.py
@@ -1,8 +1,10 @@
  import itertools
+import io
  import json
  import os
  import platform
  import sys
+import tempfile
  import threading
  import time
  
@@ -61,6 +63,77 @@ FRAME_INLINE_DEPTH_ROOT = 0
  PROCESS_TYPE_MAIN = 0
  STACKWALK_DISABLED = 0
  
+# In-memory buffer before spilling to disk
+DEFAULT_SPILL_BUFFER_BYTES = 128 * 1024
+_JSON_SEPARATORS = (",", ":")
+_JSON_ENCODER = json.JSONEncoder(
+    separators=_JSON_SEPARATORS, allow_nan=False
+)
+
+
+class SpillColumn:
+    def __init__(self, directory, basename, *,
+                 buffer_bytes=None):
+        self.path = os.path.join(directory, basename)
+        self.buffer = bytearray()
+        self._buffer_bytes = (
+            DEFAULT_SPILL_BUFFER_BYTES if buffer_bytes is None
+            else buffer_bytes
+        )
+
+    def append(self, value):
+        self.buffer += (_JSON_ENCODER.encode(value) + "\n").encode("utf-8")
+        if len(self.buffer) >= self._buffer_bytes:
+            self.flush()
+
+    def flush(self):
+        with open(self.path, "ab") as file:
+            file.write(self.buffer)
+        self.buffer.clear()
+
+    def iter_tokens(self):
+        with open(self.path, encoding="utf-8") as file:
+            for line in file:
+                yield line.rstrip("\n")
+
+
+class GeckoThreadSpill:
+    _COLUMNS = (
+        ("samples_stack", "samples-stack.json"),
+        ("samples_time", "samples-time.json"),
+        ("markers_name", "markers-name.json"),
+        ("markers_start_time", "markers-start-time.json"),
+        ("markers_end_time", "markers-end-time.json"),
+        ("markers_phase", "markers-phase.json"),
+        ("markers_category", "markers-category.json"),
+        ("markers_data", "markers-data.json"),
+    )
+
+    def __init__(self, directory, tid):
+        prefix = f"thread-{tid}-"
+        for attr, basename in self._COLUMNS:
+            setattr(self, attr, SpillColumn(directory, prefix + basename))
+        self.sample_count = 0
+        self.marker_count = 0
+
+    def append_sample(self, stack_index, time_ms):
+        self.samples_stack.append(stack_index)
+        self.samples_time.append(time_ms)
+        self.sample_count += 1
+
+    def append_marker(self, name_idx, start_time, end_time, phase, category, data):
+        self.markers_name.append(name_idx)
+        self.markers_start_time.append(start_time)
+        self.markers_end_time.append(end_time)
+        self.markers_phase.append(phase)
+        self.markers_category.append(category)
+        self.markers_data.append(data)
+        self.marker_count += 1
+
+    def prepare_read(self):
+        for attr, _basename in self._COLUMNS:
+            getattr(self, attr).flush()
+
  
  class GeckoCollector(Collector):
      aggregating = True
@@ -77,6 +150,8 @@ class GeckoCollector(Collector):
  
          # Per-thread data structures
          self.threads = {}  # tid -> thread data
+        self.spill_dir = None
+        self.exported = False
  
          # Global tables
          self.libs = []
@@ -151,6 +226,9 @@ class GeckoCollector(Collector):
              stack_frames: List of interpreter/thread frame info
              timestamps_us: List of timestamps in microseconds (None for live sampling)
          """
+        if self.exported:
+            raise RuntimeError("cannot append to GeckoCollector after export")
+
          # Handle live sampling (no timestamps provided)
          if timestamps_us is None:
              current_time = (time.monotonic() * 1000) - self.start_time
@@ -259,15 +337,9 @@ class GeckoCollector(Collector):
                  stack_index = self._process_stack(thread_data, frames)
  
                  # Add samples with timestamps
-                samples = thread_data["samples"]
-                samples_stack = samples["stack"]
-                samples_time = samples["time"]
-                samples_delay = samples["eventDelay"]
-
+                thread_spill = thread_data["_spill"]
                  for t in times:
-                    samples_stack.append(stack_index)
-                    samples_time.append(t)
-                    samples_delay.append(None)
+                    thread_spill.append_sample(stack_index, t)
  
                  # Handle opcodes
                  if self.opcodes_enabled and frames:
@@ -294,6 +366,8 @@ class GeckoCollector(Collector):
  
      def _create_thread(self, tid, is_main_thread):
          """Create a new thread structure with processed profile format."""
+        if self.spill_dir is None:
+            self.spill_dir = tempfile.TemporaryDirectory()
  
          thread = {
              "name": f"Thread-{tid}",
@@ -307,15 +381,6 @@ class GeckoCollector(Collector):
              "tid": tid,
              "processType": "default",
              "processName": "Python Process",
-            # Sample data - processed format with direct arrays
-            "samples": {
-                "stack": [],
-                "time": [],
-                "eventDelay": [],
-                "weight": None,
-                "weightType": "samples",
-                "length": 0,  # Will be updated on export
-            },
              # Stack table - processed format
              "stackTable": {
                  "frame": [],
@@ -366,21 +431,12 @@ class GeckoCollector(Collector):
                  "functionSize": [],
                  "length": 0,
              },
-            # Markers - processed format (arrays)
-            "markers": {
-                "data": [],
-                "name": [],
-                "startTime": [],
-                "endTime": [],
-                "phase": [],
-                "category": [],
-                "length": 0,
-            },
              # Caches for deduplication
              "_stackCache": {},
              "_frameCache": {},
              "_funcCache": {},
              "_resourceCache": {},
+            "_spill": GeckoThreadSpill(self.spill_dir.name, tid),
          }
  
          return thread
@@ -405,51 +461,42 @@ class GeckoCollector(Collector):
          if tid not in self.threads:
              return
  
-        thread_data = self.threads[tid]
          duration = end_time - start_time
  
          name_idx = self._intern_string(name)
-        markers = thread_data["markers"]
-        markers["name"].append(name_idx)
-        markers["startTime"].append(start_time)
-        markers["endTime"].append(end_time)
-        markers["phase"].append(1)  # 1 = interval marker
-        markers["category"].append(category)
-        markers["data"].append({
-            "type": name.replace(" ", ""),
-            "duration": duration,
-            "tid": tid
-        })
-
-    def _add_opcode_interval_marker(self, tid, opcode, lineno, col_offset, funcname, start_time, end_time):
+        self.threads[tid]["_spill"].append_marker(
+            name_idx, start_time, end_time, 1, category, {
+                "type": name.replace(" ", ""),
+                "duration": duration,
+                "tid": tid,
+            }
+        )
+
+    def _add_opcode_interval_marker(self, tid, opcode, lineno, col_offset,
+                                    funcname, start_time, end_time):
          """Add an interval marker for opcode execution span."""
          if tid not in self.threads or opcode is None:
              return
  
-        thread_data = self.threads[tid]
          opcode_info = get_opcode_info(opcode)
          # Use formatted opcode name (with base opcode for specialized ones)
          formatted_opname = format_opcode(opcode)
  
          name_idx = self._intern_string(formatted_opname)
  
-        markers = thread_data["markers"]
-        markers["name"].append(name_idx)
-        markers["startTime"].append(start_time)
-        markers["endTime"].append(end_time)
-        markers["phase"].append(1)  # 1 = interval marker
-        markers["category"].append(CATEGORY_OPCODES)
-        markers["data"].append({
-            "type": "Opcode",
-            "opcode": opcode,
-            "opname": formatted_opname,
-            "base_opname": opcode_info["base_opname"],
-            "is_specialized": opcode_info["is_specialized"],
-            "line": lineno,
-            "column": col_offset if col_offset >= 0 else None,
-            "function": funcname,
-            "duration": end_time - start_time,
-        })
+        self.threads[tid]["_spill"].append_marker(
+            name_idx, start_time, end_time, 1, CATEGORY_OPCODES, {
+                "type": "Opcode",
+                "opcode": opcode,
+                "opname": formatted_opname,
+                "base_opname": opcode_info["base_opname"],
+                "is_specialized": opcode_info["is_specialized"],
+                "line": lineno,
+                "column": col_offset if col_offset >= 0 else None,
+                "function": funcname,
+                "duration": end_time - start_time,
+            }
+        )
  
      def _process_stack(self, thread_data, frames):
          """Process a stack and return the stack index."""
@@ -660,7 +707,6 @@ class GeckoCollector(Collector):
  
      def export(self, filename):
          """Export the profile to a Gecko JSON file."""
-
          if self.sample_count > 0 and self.last_sample_time > 0:
              self.interval = self.last_sample_time / self.sample_count
  
@@ -681,19 +727,30 @@ class GeckoCollector(Collector):
          spinner_thread = threading.Thread(target=spin, daemon=True)
          spinner_thread.start()
  
+        temp_path = None
+        replaced = False
          try:
-            # Finalize any open markers before building profile
-            self._finalize_markers()
-
-            profile = self._build_profile()
-
-            with open(filename, "w") as f:
-                json.dump(profile, f, separators=(",", ":"))
+            self._prepare_for_serialization()
+            output_dir = os.path.dirname(os.path.abspath(filename)) or "."
+            with tempfile.NamedTemporaryFile(
+                "w", dir=output_dir, delete=False
+            ) as file:
+                temp_path = file.name
+                self._stream_profile(file)
+            os.replace(temp_path, filename)
+            replaced = True
          finally:
+            self.exported = True
              stop_spinner.set()
              spinner_thread.join(timeout=1.0)
              # Small delay to ensure the clear happens
              time.sleep(0.01)
+            if temp_path is not None and not replaced:
+                try:
+                    os.unlink(temp_path)
+                except FileNotFoundError:
+                    pass
+            self._cleanup_spills()
  
          print(f"Gecko profile written to {filename}")
          print(
@@ -727,34 +784,17 @@ class GeckoCollector(Collector):
  
      def _build_profile(self):
          """Build the complete profile structure in processed format."""
-        # Convert thread data to final format
-        threads = []
-
-        for tid, thread_data in self.threads.items():
-            # Update lengths
-            samples = thread_data["samples"]
-            stack_table = thread_data["stackTable"]
-            frame_table = thread_data["frameTable"]
-            func_table = thread_data["funcTable"]
-            resource_table = thread_data["resourceTable"]
-
-            samples["length"] = len(samples["stack"])
-            stack_table["length"] = len(stack_table["frame"])
-            frame_table["length"] = len(frame_table["func"])
-            func_table["length"] = len(func_table["name"])
-            resource_table["length"] = len(resource_table["name"])
-            thread_data["markers"]["length"] = len(thread_data["markers"]["name"])
-
-            # Clean up internal caches
-            del thread_data["_stackCache"]
-            del thread_data["_frameCache"]
-            del thread_data["_funcCache"]
-            del thread_data["_resourceCache"]
-
-            threads.append(thread_data)
-
-        # Main profile structure in processed format
-        profile = {
+        try:
+            self._prepare_for_serialization()
+            file = io.StringIO()
+            self._stream_profile(file)
+            return json.loads(file.getvalue())
+        finally:
+            self.exported = True
+            self._cleanup_spills()
+
+    def _profile_head(self):
+        return {
              "meta": {
                  "interval": self.interval,
                  "startTime": self.start_time,
@@ -784,7 +824,10 @@ class GeckoCollector(Collector):
                  },
              },
              "libs": self.libs,
-            "threads": threads,
+        }
+
+    def _profile_tail(self):
+        return {
              "pages": [],
              "shared": {
                  "stringArray": self.global_strings,
@@ -792,4 +835,146 @@ class GeckoCollector(Collector):
              },
          }
  
-        return profile
+    def _prepare_for_serialization(self):
+        if self.exported:
+            raise RuntimeError("GeckoCollector has already been exported")
+        self._finalize_markers()
+        for thread_data in self.threads.values():
+            thread_data["_spill"].prepare_read()
+            thread_data["stackTable"]["length"] = len(thread_data["stackTable"]["frame"])
+            thread_data["frameTable"]["length"] = len(thread_data["frameTable"]["func"])
+            thread_data["funcTable"]["length"] = len(thread_data["funcTable"]["name"])
+            thread_data["resourceTable"]["length"] = len(thread_data["resourceTable"]["name"])
+
+    def _cleanup_spills(self):
+        if self.spill_dir is not None:
+            self.spill_dir.cleanup()
+            self.spill_dir = None
+
+    def _stream_profile(self, file):
+        file.write("{")
+        first = True
+        for key, value in self._profile_head().items():
+            first = _write_json_member(file, key, value, first)
+
+        first = _write_member_name(file, "threads", first)
+        file.write("[")
+        for index, (tid, thread_data) in enumerate(self.threads.items()):
+            if index:
+                file.write(",")
+            self._stream_thread(file, tid, thread_data)
+        file.write("]")
+
+        for key, value in self._profile_tail().items():
+            first = _write_json_member(file, key, value, first)
+        file.write("}")
+
+    def _stream_thread(self, file, tid, thread_data):
+        spill = thread_data["_spill"]
+        metadata = {
+            "name": thread_data["name"],
+            "isMainThread": thread_data["isMainThread"],
+            "processStartupTime": thread_data["processStartupTime"],
+            "processShutdownTime": thread_data["processShutdownTime"],
+            "registerTime": thread_data["registerTime"],
+            "unregisterTime": thread_data["unregisterTime"],
+            "pausedRanges": thread_data["pausedRanges"],
+            "pid": thread_data["pid"],
+            "tid": thread_data["tid"],
+            "processType": thread_data["processType"],
+            "processName": thread_data["processName"],
+        }
+        file.write("{")
+        first = True
+        for key, value in metadata.items():
+            first = _write_json_member(file, key, value, first)
+
+        first = _write_member_name(file, "samples", first)
+        self._stream_samples(file, spill)
+        for key in (
+            "stackTable",
+            "frameTable",
+            "funcTable",
+            "resourceTable",
+            "nativeSymbols",
+        ):
+            first = _write_json_member(file, key, thread_data[key], first)
+        first = _write_member_name(file, "markers", first)
+        self._stream_markers(file, spill)
+        file.write("}")
+
+    def _stream_samples(self, file, spill):
+        _stream_column_table(
+            file,
+            (
+                ("stack", spill.samples_stack.iter_tokens()),
+                ("time", spill.samples_time.iter_tokens()),
+                ("eventDelay", ("null" for _ in range(spill.sample_count))),
+            ),
+            spill.sample_count,
+            (
+                ("weight", None),
+                ("weightType", "samples"),
+                ("length", spill.sample_count),
+            ),
+        )
+
+    def _stream_markers(self, file, spill):
+        _stream_column_table(
+            file,
+            (
+                ("data", spill.markers_data.iter_tokens()),
+                ("name", spill.markers_name.iter_tokens()),
+                ("startTime", spill.markers_start_time.iter_tokens()),
+                ("endTime", spill.markers_end_time.iter_tokens()),
+                ("phase", spill.markers_phase.iter_tokens()),
+                ("category", spill.markers_category.iter_tokens()),
+            ),
+            spill.marker_count,
+            (("length", spill.marker_count),),
+        )
+
+
+def _write_json(file, value):
+    for chunk in _JSON_ENCODER.iterencode(value):
+        file.write(chunk)
+
+
+def _write_member_name(file, name, first):
+    if not first:
+        file.write(",")
+    _write_json(file, name)
+    file.write(":")
+    return False
+
+
+def _write_json_member(file, name, value, first):
+    first = _write_member_name(file, name, first)
+    _write_json(file, value)
+    return first
+
+
+def _stream_column_table(file, columns, expected_count, trailing_members=()):
+    file.write("{")
+    first = True
+    for name, token_iter in columns:
+        first = _write_member_name(file, name, first)
+        _stream_array(file, token_iter, expected_count, name)
+    for name, value in trailing_members:
+        first = _write_json_member(file, name, value, first)
+    file.write("}")
+
+
+def _stream_array(file, token_iter, expected_count, label="array"):
+    file.write("[")
+    count = 0
+    for token in token_iter:
+        if count:
+            file.write(",")
+        file.write(token)
+        count += 1
+    if count != expected_count:
+        raise RuntimeError(
+            f"streamed {count} {label} items, expected {expected_count}"
+        )
+    file.write("]")
diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py

index 390a1479fdd2975b011f5ea7e1b0b69d6d44d9d5..1ab31af67fec522c9ffdecf59216efffd87e03be 100644 (file)
--- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py
+++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py
@@ -11,6 +11,7 @@ from test.support import is_emscripten
  
  try:
      import _remote_debugging  # noqa: F401
+    from profiling.sampling import gecko_collector
      from profiling.sampling.pstats_collector import PstatsCollector
      from profiling.sampling.stack_collector import (
          CollapsedStackCollector,
@@ -59,6 +60,42 @@ def find_child_by_name(children, strings, substr):
      return None
  
  
+def export_gecko_profile(testcase, collector):
+    gecko_out = tempfile.NamedTemporaryFile(suffix=".json", delete=False)
+    testcase.addCleanup(close_and_unlink, gecko_out)
+    # We cannot overwrite an open file on Windows.
+    gecko_out.close()
+
+    with captured_stdout(), captured_stderr():
+        collector.export(gecko_out.name)
+
+    testcase.assertGreater(os.path.getsize(gecko_out.name), 0)
+    with open(gecko_out.name, encoding="utf-8") as file:
+        return json.load(file)
+
+
+def assert_gecko_column_lengths(testcase, table, columns):
+    expected = table["length"]
+    for column in columns:
+        testcase.assertEqual(
+            len(table[column]), expected,
+            f"{column!r} has wrong length",
+        )
+
+
+def gecko_marker_names(profile, markers):
+    string_array = profile["shared"]["stringArray"]
+    return [string_array[idx] for idx in markers["name"]]
+
+
+def gecko_opcode_marker_data(profile):
+    markers = profile["threads"][0]["markers"]
+    return [
+        data for data in markers["data"]
+        if data.get("type") == "Opcode"
+    ]
+
+
  class TestSampleProfilerComponents(unittest.TestCase):
      """Unit tests for individual profiler components."""
  
@@ -583,9 +620,10 @@ class TestSampleProfilerComponents(unittest.TestCase):
  
          # Verify samples
          samples = thread_data["samples"]
-        self.assertEqual(len(samples["stack"]), 1)
-        self.assertEqual(len(samples["time"]), 1)
          self.assertEqual(samples["length"], 1)
+        assert_gecko_column_lengths(
+            self, samples, ("stack", "time", "eventDelay")
+        )
  
          # Verify function table structure and content
          func_table = thread_data["funcTable"]
@@ -622,9 +660,6 @@ class TestSampleProfilerComponents(unittest.TestCase):
      @unittest.skipIf(is_emscripten, "threads not available")
      def test_gecko_collector_export(self):
          """Test Gecko profile export functionality."""
-        gecko_out = tempfile.NamedTemporaryFile(suffix=".json", delete=False)
-        self.addCleanup(close_and_unlink, gecko_out)
-
          collector = GeckoCollector(1000)
  
          test_frames1 = [
@@ -657,17 +692,7 @@ class TestSampleProfilerComponents(unittest.TestCase):
          collector.collect(test_frames2)
          collector.collect(test_frames3)
  
-        # Export gecko profile
-        with captured_stdout(), captured_stderr():
-            collector.export(gecko_out.name)
-
-        # Verify file was created and contains valid data
-        self.assertTrue(os.path.exists(gecko_out.name))
-        self.assertGreater(os.path.getsize(gecko_out.name), 0)
-
-        # Check file contains valid JSON
-        with open(gecko_out.name, "r") as f:
-            profile_data = json.load(f)
+        profile_data = export_gecko_profile(self, collector)
  
          # Should be valid Gecko profile format
          self.assertIn("meta", profile_data)
@@ -688,6 +713,100 @@ class TestSampleProfilerComponents(unittest.TestCase):
          self.assertIn("func2", string_array)
          self.assertIn("other_func", string_array)
  
+        thread_data = profile_data["threads"][0]
+        assert_gecko_column_lengths(
+            self, thread_data["samples"], ("stack", "time", "eventDelay")
+        )
+
+    @unittest.skipIf(is_emscripten, "threads not available")
+    def test_gecko_collector_export_after_spill_flush(self):
+        """Test Gecko profile export after spill buffers flush to disk."""
+        old_buffer_bytes = gecko_collector.DEFAULT_SPILL_BUFFER_BYTES
+        gecko_collector.DEFAULT_SPILL_BUFFER_BYTES = 1
+        self.addCleanup(
+            setattr, gecko_collector, "DEFAULT_SPILL_BUFFER_BYTES",
+            old_buffer_bytes
+        )
+
+        collector = GeckoCollector(1000)
+        test_frames = [
+            MockInterpreterInfo(
+                0,
+                [
+                    MockThreadInfo(
+                        1,
+                        [MockFrameInfo("file.py", 10, "func")],
+                        status=THREAD_STATUS_HAS_GIL,
+                    )
+                ],
+            )
+        ]
+        collector.collect(test_frames, timestamps_us=[1000, 2000, 3000])
+
+        profile_data = export_gecko_profile(self, collector)
+        samples = profile_data["threads"][0]["samples"]
+        self.assertEqual(samples["length"], 3)
+        assert_gecko_column_lengths(
+            self, samples, ("stack", "time", "eventDelay")
+        )
+
+    @unittest.skipIf(is_emscripten, "threads not available")
+    def test_gecko_collector_rejects_collect_after_export(self):
+        collector = GeckoCollector(1000)
+        test_frames = [
+            MockInterpreterInfo(
+                0,
+                [
+                    MockThreadInfo(
+                        1,
+                        [MockFrameInfo("file.py", 10, "func")],
+                        status=THREAD_STATUS_HAS_GIL,
+                    )
+                ],
+            )
+        ]
+        collector.collect(test_frames)
+        export_gecko_profile(self, collector)
+
+        with self.assertRaisesRegex(RuntimeError, "after export"):
+            collector.collect(test_frames)
+
+    @unittest.skipIf(is_emscripten, "threads not available")
+    def test_gecko_collector_export_failure_keeps_existing_file(self):
+        collector = GeckoCollector(1000)
+        test_frames = [
+            MockInterpreterInfo(
+                0,
+                [
+                    MockThreadInfo(
+                        1,
+                        [MockFrameInfo("file.py", 10, "func")],
+                        status=THREAD_STATUS_HAS_GIL,
+                    )
+                ],
+            )
+        ]
+        collector.collect(test_frames)
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            filename = os.path.join(temp_dir, "profile.json")
+            with open(filename, "w", encoding="utf-8") as file:
+                file.write("existing")
+
+            before = set(os.listdir(temp_dir))
+
+            def fail(file):
+                raise OSError("boom")
+
+            collector._stream_profile = fail
+            with captured_stdout(), captured_stderr():
+                with self.assertRaisesRegex(OSError, "boom"):
+                    collector.export(filename)
+
+            with open(filename, encoding="utf-8") as file:
+                self.assertEqual(file.read(), "existing")
+            self.assertEqual(set(os.listdir(temp_dir)), before)
+
      def test_gecko_collector_markers(self):
          """Test Gecko profile markers for GIL and CPU state tracking."""
          collector = GeckoCollector(1000)
@@ -771,21 +890,16 @@ class TestSampleProfilerComponents(unittest.TestCase):
          self.assertIn("markers", thread_data)
          markers = thread_data["markers"]
  
-        # Should have marker arrays
-        self.assertIn("name", markers)
-        self.assertIn("startTime", markers)
-        self.assertIn("endTime", markers)
-        self.assertIn("category", markers)
          self.assertGreater(
              markers["length"], 0, "Should have generated markers"
          )
-
-        # Get marker names from string table
-        string_array = profile_data["shared"]["stringArray"]
-        marker_names = [string_array[idx] for idx in markers["name"]]
+        assert_gecko_column_lengths(
+            self, markers,
+            ("data", "name", "startTime", "endTime", "phase", "category"),
+        )
  
          # Verify we have different marker types
-        marker_name_set = set(marker_names)
+        marker_name_set = set(gecko_marker_names(profile_data, markers))
  
          # Should have "Has GIL" markers (when thread had GIL)
          self.assertIn(
@@ -2659,6 +2773,7 @@ class TestGeckoOpcodeMarkers(unittest.TestCase):
      def test_gecko_opcode_state_tracking(self):
          """Test that GeckoCollector tracks opcode state changes."""
          collector = GeckoCollector(sample_interval_usec=1000, opcodes=True)
+        self.addCleanup(collector._cleanup_spills)
  
          # First sample with opcode 90 (RAISE_VARARGS)
          frame1 = MockFrameInfo("test.py", 10, "func", opcode=90)
@@ -2702,10 +2817,28 @@ class TestGeckoOpcodeMarkers(unittest.TestCase):
          collector.collect(frames2)
  
          # Should have emitted a marker for the first opcode
-        thread_data = collector.threads[1]
-        markers = thread_data["markers"]
-        # At least one marker should have been added
-        self.assertGreater(len(markers["name"]), 0)
+        profile = collector._build_profile()
+        markers = profile["threads"][0]["markers"]
+        assert_gecko_column_lengths(
+            self, markers,
+            ("data", "name", "startTime", "endTime", "phase", "category"),
+        )
+        opcode_markers = gecko_opcode_marker_data(profile)
+        self.assertIn(
+            {
+                "opcode": 90,
+                "line": 10,
+                "function": "func",
+            },
+            [
+                {
+                    "opcode": marker["opcode"],
+                    "line": marker["line"],
+                    "function": marker["function"],
+                }
+                for marker in opcode_markers
+            ],
+        )
  
      def test_gecko_opcode_markers_not_emitted_when_disabled(self):
          """Test that no opcode markers when opcodes=False."""
@@ -2729,8 +2862,9 @@ class TestGeckoOpcodeMarkers(unittest.TestCase):
          ]
          collector.collect(frames2)
  
-        # opcode_state should not be tracked
-        self.assertEqual(len(collector.opcode_state), 0)
+        profile = collector._build_profile()
+        self.assertEqual(gecko_opcode_marker_data(profile), [])
+        self.assertEqual(profile["meta"]["markerSchema"], [])
  
      def test_gecko_opcode_with_none_opcode(self):
          """Test that None opcode doesn't cause issues."""
@@ -2746,9 +2880,8 @@ class TestGeckoOpcodeMarkers(unittest.TestCase):
          ]
          collector.collect(frames)
  
-        # Should track the state but opcode is None
-        self.assertIn(1, collector.opcode_state)
-        self.assertIsNone(collector.opcode_state[1][0])
+        profile = collector._build_profile()
+        self.assertEqual(gecko_opcode_marker_data(profile), [])
  
  
  class TestCollectorFrameFormat(unittest.TestCase):
diff --git a/Misc/NEWS.d/next/Library/2026-06-03-13-51-29.gh-issue-150662.ELT8Vg.rst b/Misc/NEWS.d/next/Library/2026-06-03-13-51-29.gh-issue-150662.ELT8Vg.rst

new file mode 100644 (file)

index 0000000..42ed6ad
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-06-03-13-51-29.gh-issue-150662.ELT8Vg.rst
@@ -0,0 +1,4 @@
+Fix the ``--gecko`` collector in :mod:`profiling.sampling` that kept every
+sample in memory. It now writes sample and marker data to temporary files
+and reads them back, ultimately building the output file at the end. Patch
+by Pablo Galindo and Maurycy Pawłowski-Wieroński.
author	Maurycy Pawłowski-Wieroński <maurycy@maurycy.com>
	Sat, 6 Jun 2026 02:27:41 +0000 (04:27 +0200)
committer	GitHub <noreply@github.com>
	Sat, 6 Jun 2026 02:27:41 +0000 (02:27 +0000)
Lib/profiling/sampling/gecko_collector.py		patch \| blob \| blame \| history
Lib/test/test_profiling/test_sampling_profiler/test_collectors.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2026-06-03-13-51-29.gh-issue-150662.ELT8Vg.rst	[new file with mode: 0644]	patch \| blob