From: Maurycy Pawłowski-Wieroński <maurycy@maurycy.com>
Date: Sat, 6 Jun 2026 02:27:41 +0000 (+0200)
Subject: gh-150662: Stop unbounded memory growth in Tachyon `--gecko` collector (#150845)
X-Git-Url: http://git.ipfire.org/gitweb/index.cgi?a=commitdiff_plain;h=785b6dcf71b08c34a1d0a1cff9dc4afe997ad283;p=thirdparty%2FPython%2Fcpython.git

gh-150662: Stop unbounded memory growth in Tachyon `--gecko` collector (#150845)
---

diff --git a/Lib/profiling/sampling/gecko_collector.py b/Lib/profiling/sampling/gecko_collector.py
index 54392af95000..361f6037f216 100644
--- a/Lib/profiling/sampling/gecko_collector.py
+++ b/Lib/profiling/sampling/gecko_collector.py
@@ -1,8 +1,10 @@
 import itertools
+import io
 import json
 import os
 import platform
 import sys
+import tempfile
 import threading
 import time
 
@@ -61,6 +63,77 @@ FRAME_INLINE_DEPTH_ROOT = 0
 PROCESS_TYPE_MAIN = 0
 STACKWALK_DISABLED = 0
 
+# In-memory buffer before spilling to disk
+DEFAULT_SPILL_BUFFER_BYTES = 128 * 1024
+_JSON_SEPARATORS = (",", ":")
+_JSON_ENCODER = json.JSONEncoder(
+    separators=_JSON_SEPARATORS, allow_nan=False
+)
+
+
+class SpillColumn:
+    def __init__(self, directory, basename, *,
+                 buffer_bytes=None):
+        self.path = os.path.join(directory, basename)
+        self.buffer = bytearray()
+        self._buffer_bytes = (
+            DEFAULT_SPILL_BUFFER_BYTES if buffer_bytes is None
+            else buffer_bytes
+        )
+
+    def append(self, value):
+        self.buffer += (_JSON_ENCODER.encode(value) + "\n").encode("utf-8")
+        if len(self.buffer) >= self._buffer_bytes:
+            self.flush()
+
+    def flush(self):
+        with open(self.path, "ab") as file:
+            file.write(self.buffer)
+        self.buffer.clear()
+
+    def iter_tokens(self):
+        with open(self.path, encoding="utf-8") as file:
+            for line in file:
+                yield line.rstrip("\n")
+
+
+class GeckoThreadSpill:
+    _COLUMNS = (
+        ("samples_stack", "samples-stack.json"),
+        ("samples_time", "samples-time.json"),
+        ("markers_name", "markers-name.json"),
+        ("markers_start_time", "markers-start-time.json"),
+        ("markers_end_time", "markers-end-time.json"),
+        ("markers_phase", "markers-phase.json"),
+        ("markers_category", "markers-category.json"),
+        ("markers_data", "markers-data.json"),
+    )
+
+    def __init__(self, directory, tid):
+        prefix = f"thread-{tid}-"
+        for attr, basename in self._COLUMNS:
+            setattr(self, attr, SpillColumn(directory, prefix + basename))
+        self.sample_count = 0
+        self.marker_count = 0
+
+    def append_sample(self, stack_index, time_ms):
+        self.samples_stack.append(stack_index)
+        self.samples_time.append(time_ms)
+        self.sample_count += 1
+
+    def append_marker(self, name_idx, start_time, end_time, phase, category, data):
+        self.markers_name.append(name_idx)
+        self.markers_start_time.append(start_time)
+        self.markers_end_time.append(end_time)
+        self.markers_phase.append(phase)
+        self.markers_category.append(category)
+        self.markers_data.append(data)
+        self.marker_count += 1
+
+    def prepare_read(self):
+        for attr, _basename in self._COLUMNS:
+            getattr(self, attr).flush()
+
 
 class GeckoCollector(Collector):
     aggregating = True
@@ -77,6 +150,8 @@ class GeckoCollector(Collector):
 
         # Per-thread data structures
         self.threads = {}  # tid -> thread data
+        self.spill_dir = None
+        self.exported = False
 
         # Global tables
         self.libs = []
@@ -151,6 +226,9 @@ class GeckoCollector(Collector):
             stack_frames: List of interpreter/thread frame info
             timestamps_us: List of timestamps in microseconds (None for live sampling)
         """
+        if self.exported:
+            raise RuntimeError("cannot append to GeckoCollector after export")
+
         # Handle live sampling (no timestamps provided)
         if timestamps_us is None:
             current_time = (time.monotonic() * 1000) - self.start_time
@@ -259,15 +337,9 @@ class GeckoCollector(Collector):
                 stack_index = self._process_stack(thread_data, frames)
 
                 # Add samples with timestamps
-                samples = thread_data["samples"]
-                samples_stack = samples["stack"]
-                samples_time = samples["time"]
-                samples_delay = samples["eventDelay"]
-
+                thread_spill = thread_data["_spill"]
                 for t in times:
-                    samples_stack.append(stack_index)
-                    samples_time.append(t)
-                    samples_delay.append(None)
+                    thread_spill.append_sample(stack_index, t)
 
                 # Handle opcodes
                 if self.opcodes_enabled and frames:
@@ -294,6 +366,8 @@ class GeckoCollector(Collector):
 
     def _create_thread(self, tid, is_main_thread):
         """Create a new thread structure with processed profile format."""
+        if self.spill_dir is None:
+            self.spill_dir = tempfile.TemporaryDirectory()
 
         thread = {
             "name": f"Thread-{tid}",
@@ -307,15 +381,6 @@ class GeckoCollector(Collector):
             "tid": tid,
             "processType": "default",
             "processName": "Python Process",
-            # Sample data - processed format with direct arrays
-            "samples": {
-                "stack": [],
-                "time": [],
-                "eventDelay": [],
-                "weight": None,
-                "weightType": "samples",
-                "length": 0,  # Will be updated on export
-            },
             # Stack table - processed format
             "stackTable": {
                 "frame": [],
@@ -366,21 +431,12 @@ class GeckoCollector(Collector):
                 "functionSize": [],
                 "length": 0,
             },
-            # Markers - processed format (arrays)
-            "markers": {
-                "data": [],
-                "name": [],
-                "startTime": [],
-                "endTime": [],
-                "phase": [],
-                "category": [],
-                "length": 0,
-            },
             # Caches for deduplication
             "_stackCache": {},
             "_frameCache": {},
             "_funcCache": {},
             "_resourceCache": {},
+            "_spill": GeckoThreadSpill(self.spill_dir.name, tid),
         }
 
         return thread
@@ -405,51 +461,42 @@ class GeckoCollector(Collector):
         if tid not in self.threads:
             return
 
-        thread_data = self.threads[tid]
         duration = end_time - start_time
 
         name_idx = self._intern_string(name)
-        markers = thread_data["markers"]
-        markers["name"].append(name_idx)
-        markers["startTime"].append(start_time)
-        markers["endTime"].append(end_time)
-        markers["phase"].append(1)  # 1 = interval marker
-        markers["category"].append(category)
-        markers["data"].append({
-            "type": name.replace(" ", ""),
-            "duration": duration,
-            "tid": tid
-        })
-
-    def _add_opcode_interval_marker(self, tid, opcode, lineno, col_offset, funcname, start_time, end_time):
+        self.threads[tid]["_spill"].append_marker(
+            name_idx, start_time, end_time, 1, category, {
+                "type": name.replace(" ", ""),
+                "duration": duration,
+                "tid": tid,
+            }
+        )
+
+    def _add_opcode_interval_marker(self, tid, opcode, lineno, col_offset,
+                                    funcname, start_time, end_time):
         """Add an interval marker for opcode execution span."""
         if tid not in self.threads or opcode is None:
             return
 
-        thread_data = self.threads[tid]
         opcode_info = get_opcode_info(opcode)
         # Use formatted opcode name (with base opcode for specialized ones)
         formatted_opname = format_opcode(opcode)
 
         name_idx = self._intern_string(formatted_opname)
 
-        markers = thread_data["markers"]
-        markers["name"].append(name_idx)
-        markers["startTime"].append(start_time)
-        markers["endTime"].append(end_time)
-        markers["phase"].append(1)  # 1 = interval marker
-        markers["category"].append(CATEGORY_OPCODES)
-        markers["data"].append({
-            "type": "Opcode",
-            "opcode": opcode,
-            "opname": formatted_opname,
-            "base_opname": opcode_info["base_opname"],
-            "is_specialized": opcode_info["is_specialized"],
-            "line": lineno,
-            "column": col_offset if col_offset >= 0 else None,
-            "function": funcname,
-            "duration": end_time - start_time,
-        })
+        self.threads[tid]["_spill"].append_marker(
+            name_idx, start_time, end_time, 1, CATEGORY_OPCODES, {
+                "type": "Opcode",
+                "opcode": opcode,
+                "opname": formatted_opname,
+                "base_opname": opcode_info["base_opname"],
+                "is_specialized": opcode_info["is_specialized"],
+                "line": lineno,
+                "column": col_offset if col_offset >= 0 else None,
+                "function": funcname,
+                "duration": end_time - start_time,
+            }
+        )
 
     def _process_stack(self, thread_data, frames):
         """Process a stack and return the stack index."""
@@ -660,7 +707,6 @@ class GeckoCollector(Collector):
 
     def export(self, filename):
         """Export the profile to a Gecko JSON file."""
-
         if self.sample_count > 0 and self.last_sample_time > 0:
             self.interval = self.last_sample_time / self.sample_count
 
@@ -681,19 +727,30 @@ class GeckoCollector(Collector):
         spinner_thread = threading.Thread(target=spin, daemon=True)
         spinner_thread.start()
 
+        temp_path = None
+        replaced = False
         try:
-            # Finalize any open markers before building profile
-            self._finalize_markers()
-
-            profile = self._build_profile()
-
-            with open(filename, "w") as f:
-                json.dump(profile, f, separators=(",", ":"))
+            self._prepare_for_serialization()
+            output_dir = os.path.dirname(os.path.abspath(filename)) or "."
+            with tempfile.NamedTemporaryFile(
+                "w", dir=output_dir, delete=False
+            ) as file:
+                temp_path = file.name
+                self._stream_profile(file)
+            os.replace(temp_path, filename)
+            replaced = True
         finally:
+            self.exported = True
             stop_spinner.set()
             spinner_thread.join(timeout=1.0)
             # Small delay to ensure the clear happens
             time.sleep(0.01)
+            if temp_path is not None and not replaced:
+                try:
+                    os.unlink(temp_path)
+                except FileNotFoundError:
+                    pass
+            self._cleanup_spills()
 
         print(f"Gecko profile written to {filename}")
         print(
@@ -727,34 +784,17 @@ class GeckoCollector(Collector):
 
     def _build_profile(self):
         """Build the complete profile structure in processed format."""
-        # Convert thread data to final format
-        threads = []
-
-        for tid, thread_data in self.threads.items():
-            # Update lengths
-            samples = thread_data["samples"]
-            stack_table = thread_data["stackTable"]
-            frame_table = thread_data["frameTable"]
-            func_table = thread_data["funcTable"]
-            resource_table = thread_data["resourceTable"]
-
-            samples["length"] = len(samples["stack"])
-            stack_table["length"] = len(stack_table["frame"])
-            frame_table["length"] = len(frame_table["func"])
-            func_table["length"] = len(func_table["name"])
-            resource_table["length"] = len(resource_table["name"])
-            thread_data["markers"]["length"] = len(thread_data["markers"]["name"])
-
-            # Clean up internal caches
-            del thread_data["_stackCache"]
-            del thread_data["_frameCache"]
-            del thread_data["_funcCache"]
-            del thread_data["_resourceCache"]
-
-            threads.append(thread_data)
-
-        # Main profile structure in processed format
-        profile = {
+        try:
+            self._prepare_for_serialization()
+            file = io.StringIO()
+            self._stream_profile(file)
+            return json.loads(file.getvalue())
+        finally:
+            self.exported = True
+            self._cleanup_spills()
+
+    def _profile_head(self):
+        return {
             "meta": {
                 "interval": self.interval,
                 "startTime": self.start_time,
@@ -784,7 +824,10 @@ class GeckoCollector(Collector):
                 },
             },
             "libs": self.libs,
-            "threads": threads,
+        }
+
+    def _profile_tail(self):
+        return {
             "pages": [],
             "shared": {
                 "stringArray": self.global_strings,
@@ -792,4 +835,146 @@ class GeckoCollector(Collector):
             },
         }
 
-        return profile
+    def _prepare_for_serialization(self):
+        if self.exported:
+            raise RuntimeError("GeckoCollector has already been exported")
+        self._finalize_markers()
+        for thread_data in self.threads.values():
+            thread_data["_spill"].prepare_read()
+            thread_data["stackTable"]["length"] = len(thread_data["stackTable"]["frame"])
+            thread_data["frameTable"]["length"] = len(thread_data["frameTable"]["func"])
+            thread_data["funcTable"]["length"] = len(thread_data["funcTable"]["name"])
+            thread_data["resourceTable"]["length"] = len(thread_data["resourceTable"]["name"])
+
+    def _cleanup_spills(self):
+        if self.spill_dir is not None:
+            self.spill_dir.cleanup()
+            self.spill_dir = None
+
+    def _stream_profile(self, file):
+        file.write("{")
+        first = True
+        for key, value in self._profile_head().items():
+            first = _write_json_member(file, key, value, first)
+
+        first = _write_member_name(file, "threads", first)
+        file.write("[")
+        for index, (tid, thread_data) in enumerate(self.threads.items()):
+            if index:
+                file.write(",")
+            self._stream_thread(file, tid, thread_data)
+        file.write("]")
+
+        for key, value in self._profile_tail().items():
+            first = _write_json_member(file, key, value, first)
+        file.write("}")
+
+    def _stream_thread(self, file, tid, thread_data):
+        spill = thread_data["_spill"]
+        metadata = {
+            "name": thread_data["name"],
+            "isMainThread": thread_data["isMainThread"],
+            "processStartupTime": thread_data["processStartupTime"],
+            "processShutdownTime": thread_data["processShutdownTime"],
+            "registerTime": thread_data["registerTime"],
+            "unregisterTime": thread_data["unregisterTime"],
+            "pausedRanges": thread_data["pausedRanges"],
+            "pid": thread_data["pid"],
+            "tid": thread_data["tid"],
+            "processType": thread_data["processType"],
+            "processName": thread_data["processName"],
+        }
+        file.write("{")
+        first = True
+        for key, value in metadata.items():
+            first = _write_json_member(file, key, value, first)
+
+        first = _write_member_name(file, "samples", first)
+        self._stream_samples(file, spill)
+        for key in (
+            "stackTable",
+            "frameTable",
+            "funcTable",
+            "resourceTable",
+            "nativeSymbols",
+        ):
+            first = _write_json_member(file, key, thread_data[key], first)
+        first = _write_member_name(file, "markers", first)
+        self._stream_markers(file, spill)
+        file.write("}")
+
+    def _stream_samples(self, file, spill):
+        _stream_column_table(
+            file,
+            (
+                ("stack", spill.samples_stack.iter_tokens()),
+                ("time", spill.samples_time.iter_tokens()),
+                ("eventDelay", ("null" for _ in range(spill.sample_count))),
+            ),
+            spill.sample_count,
+            (
+                ("weight", None),
+                ("weightType", "samples"),
+                ("length", spill.sample_count),
+            ),
+        )
+
+    def _stream_markers(self, file, spill):
+        _stream_column_table(
+            file,
+            (
+                ("data", spill.markers_data.iter_tokens()),
+                ("name", spill.markers_name.iter_tokens()),
+                ("startTime", spill.markers_start_time.iter_tokens()),
+                ("endTime", spill.markers_end_time.iter_tokens()),
+                ("phase", spill.markers_phase.iter_tokens()),
+                ("category", spill.markers_category.iter_tokens()),
+            ),
+            spill.marker_count,
+            (("length", spill.marker_count),),
+        )
+
+
+def _write_json(file, value):
+    for chunk in _JSON_ENCODER.iterencode(value):
+        file.write(chunk)
+
+
+def _write_member_name(file, name, first):
+    if not first:
+        file.write(",")
+    _write_json(file, name)
+    file.write(":")
+    return False
+
+
+def _write_json_member(file, name, value, first):
+    first = _write_member_name(file, name, first)
+    _write_json(file, value)
+    return first
+
+
+def _stream_column_table(file, columns, expected_count, trailing_members=()):
+    file.write("{")
+    first = True
+    for name, token_iter in columns:
+        first = _write_member_name(file, name, first)
+        _stream_array(file, token_iter, expected_count, name)
+    for name, value in trailing_members:
+        first = _write_json_member(file, name, value, first)
+    file.write("}")
+
+
+def _stream_array(file, token_iter, expected_count, label="array"):
+    file.write("[")
+    count = 0
+    for token in token_iter:
+        if count:
+            file.write(",")
+        file.write(token)
+        count += 1
+    if count != expected_count:
+        raise RuntimeError(
+            f"streamed {count} {label} items, expected {expected_count}"
+        )
+    file.write("]")
diff --git a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py
index 390a1479fdd2..1ab31af67fec 100644
--- a/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py
+++ b/Lib/test/test_profiling/test_sampling_profiler/test_collectors.py
@@ -11,6 +11,7 @@ from test.support import is_emscripten
 
 try:
     import _remote_debugging  # noqa: F401
+    from profiling.sampling import gecko_collector
     from profiling.sampling.pstats_collector import PstatsCollector
     from profiling.sampling.stack_collector import (
         CollapsedStackCollector,
@@ -59,6 +60,42 @@ def find_child_by_name(children, strings, substr):
     return None
 
 
+def export_gecko_profile(testcase, collector):
+    gecko_out = tempfile.NamedTemporaryFile(suffix=".json", delete=False)
+    testcase.addCleanup(close_and_unlink, gecko_out)
+    # We cannot overwrite an open file on Windows.
+    gecko_out.close()
+
+    with captured_stdout(), captured_stderr():
+        collector.export(gecko_out.name)
+
+    testcase.assertGreater(os.path.getsize(gecko_out.name), 0)
+    with open(gecko_out.name, encoding="utf-8") as file:
+        return json.load(file)
+
+
+def assert_gecko_column_lengths(testcase, table, columns):
+    expected = table["length"]
+    for column in columns:
+        testcase.assertEqual(
+            len(table[column]), expected,
+            f"{column!r} has wrong length",
+        )
+
+
+def gecko_marker_names(profile, markers):
+    string_array = profile["shared"]["stringArray"]
+    return [string_array[idx] for idx in markers["name"]]
+
+
+def gecko_opcode_marker_data(profile):
+    markers = profile["threads"][0]["markers"]
+    return [
+        data for data in markers["data"]
+        if data.get("type") == "Opcode"
+    ]
+
+
 class TestSampleProfilerComponents(unittest.TestCase):
     """Unit tests for individual profiler components."""
 
@@ -583,9 +620,10 @@ class TestSampleProfilerComponents(unittest.TestCase):
 
         # Verify samples
         samples = thread_data["samples"]
-        self.assertEqual(len(samples["stack"]), 1)
-        self.assertEqual(len(samples["time"]), 1)
         self.assertEqual(samples["length"], 1)
+        assert_gecko_column_lengths(
+            self, samples, ("stack", "time", "eventDelay")
+        )
 
         # Verify function table structure and content
         func_table = thread_data["funcTable"]
@@ -622,9 +660,6 @@ class TestSampleProfilerComponents(unittest.TestCase):
     @unittest.skipIf(is_emscripten, "threads not available")
     def test_gecko_collector_export(self):
         """Test Gecko profile export functionality."""
-        gecko_out = tempfile.NamedTemporaryFile(suffix=".json", delete=False)
-        self.addCleanup(close_and_unlink, gecko_out)
-
         collector = GeckoCollector(1000)
 
         test_frames1 = [
@@ -657,17 +692,7 @@ class TestSampleProfilerComponents(unittest.TestCase):
         collector.collect(test_frames2)
         collector.collect(test_frames3)
 
-        # Export gecko profile
-        with captured_stdout(), captured_stderr():
-            collector.export(gecko_out.name)
-
-        # Verify file was created and contains valid data
-        self.assertTrue(os.path.exists(gecko_out.name))
-        self.assertGreater(os.path.getsize(gecko_out.name), 0)
-
-        # Check file contains valid JSON
-        with open(gecko_out.name, "r") as f:
-            profile_data = json.load(f)
+        profile_data = export_gecko_profile(self, collector)
 
         # Should be valid Gecko profile format
         self.assertIn("meta", profile_data)
@@ -688,6 +713,100 @@ class TestSampleProfilerComponents(unittest.TestCase):
         self.assertIn("func2", string_array)
         self.assertIn("other_func", string_array)
 
+        thread_data = profile_data["threads"][0]
+        assert_gecko_column_lengths(
+            self, thread_data["samples"], ("stack", "time", "eventDelay")
+        )
+
+    @unittest.skipIf(is_emscripten, "threads not available")
+    def test_gecko_collector_export_after_spill_flush(self):
+        """Test Gecko profile export after spill buffers flush to disk."""
+        old_buffer_bytes = gecko_collector.DEFAULT_SPILL_BUFFER_BYTES
+        gecko_collector.DEFAULT_SPILL_BUFFER_BYTES = 1
+        self.addCleanup(
+            setattr, gecko_collector, "DEFAULT_SPILL_BUFFER_BYTES",
+            old_buffer_bytes
+        )
+
+        collector = GeckoCollector(1000)
+        test_frames = [
+            MockInterpreterInfo(
+                0,
+                [
+                    MockThreadInfo(
+                        1,
+                        [MockFrameInfo("file.py", 10, "func")],
+                        status=THREAD_STATUS_HAS_GIL,
+                    )
+                ],
+            )
+        ]
+        collector.collect(test_frames, timestamps_us=[1000, 2000, 3000])
+
+        profile_data = export_gecko_profile(self, collector)
+        samples = profile_data["threads"][0]["samples"]
+        self.assertEqual(samples["length"], 3)
+        assert_gecko_column_lengths(
+            self, samples, ("stack", "time", "eventDelay")
+        )
+
+    @unittest.skipIf(is_emscripten, "threads not available")
+    def test_gecko_collector_rejects_collect_after_export(self):
+        collector = GeckoCollector(1000)
+        test_frames = [
+            MockInterpreterInfo(
+                0,
+                [
+                    MockThreadInfo(
+                        1,
+                        [MockFrameInfo("file.py", 10, "func")],
+                        status=THREAD_STATUS_HAS_GIL,
+                    )
+                ],
+            )
+        ]
+        collector.collect(test_frames)
+        export_gecko_profile(self, collector)
+
+        with self.assertRaisesRegex(RuntimeError, "after export"):
+            collector.collect(test_frames)
+
+    @unittest.skipIf(is_emscripten, "threads not available")
+    def test_gecko_collector_export_failure_keeps_existing_file(self):
+        collector = GeckoCollector(1000)
+        test_frames = [
+            MockInterpreterInfo(
+                0,
+                [
+                    MockThreadInfo(
+                        1,
+                        [MockFrameInfo("file.py", 10, "func")],
+                        status=THREAD_STATUS_HAS_GIL,
+                    )
+                ],
+            )
+        ]
+        collector.collect(test_frames)
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            filename = os.path.join(temp_dir, "profile.json")
+            with open(filename, "w", encoding="utf-8") as file:
+                file.write("existing")
+
+            before = set(os.listdir(temp_dir))
+
+            def fail(file):
+                raise OSError("boom")
+
+            collector._stream_profile = fail
+            with captured_stdout(), captured_stderr():
+                with self.assertRaisesRegex(OSError, "boom"):
+                    collector.export(filename)
+
+            with open(filename, encoding="utf-8") as file:
+                self.assertEqual(file.read(), "existing")
+            self.assertEqual(set(os.listdir(temp_dir)), before)
+
     def test_gecko_collector_markers(self):
         """Test Gecko profile markers for GIL and CPU state tracking."""
         collector = GeckoCollector(1000)
@@ -771,21 +890,16 @@ class TestSampleProfilerComponents(unittest.TestCase):
         self.assertIn("markers", thread_data)
         markers = thread_data["markers"]
 
-        # Should have marker arrays
-        self.assertIn("name", markers)
-        self.assertIn("startTime", markers)
-        self.assertIn("endTime", markers)
-        self.assertIn("category", markers)
         self.assertGreater(
             markers["length"], 0, "Should have generated markers"
         )
-
-        # Get marker names from string table
-        string_array = profile_data["shared"]["stringArray"]
-        marker_names = [string_array[idx] for idx in markers["name"]]
+        assert_gecko_column_lengths(
+            self, markers,
+            ("data", "name", "startTime", "endTime", "phase", "category"),
+        )
 
         # Verify we have different marker types
-        marker_name_set = set(marker_names)
+        marker_name_set = set(gecko_marker_names(profile_data, markers))
 
         # Should have "Has GIL" markers (when thread had GIL)
         self.assertIn(
@@ -2659,6 +2773,7 @@ class TestGeckoOpcodeMarkers(unittest.TestCase):
     def test_gecko_opcode_state_tracking(self):
         """Test that GeckoCollector tracks opcode state changes."""
         collector = GeckoCollector(sample_interval_usec=1000, opcodes=True)
+        self.addCleanup(collector._cleanup_spills)
 
         # First sample with opcode 90 (RAISE_VARARGS)
         frame1 = MockFrameInfo("test.py", 10, "func", opcode=90)
@@ -2702,10 +2817,28 @@ class TestGeckoOpcodeMarkers(unittest.TestCase):
         collector.collect(frames2)
 
         # Should have emitted a marker for the first opcode
-        thread_data = collector.threads[1]
-        markers = thread_data["markers"]
-        # At least one marker should have been added
-        self.assertGreater(len(markers["name"]), 0)
+        profile = collector._build_profile()
+        markers = profile["threads"][0]["markers"]
+        assert_gecko_column_lengths(
+            self, markers,
+            ("data", "name", "startTime", "endTime", "phase", "category"),
+        )
+        opcode_markers = gecko_opcode_marker_data(profile)
+        self.assertIn(
+            {
+                "opcode": 90,
+                "line": 10,
+                "function": "func",
+            },
+            [
+                {
+                    "opcode": marker["opcode"],
+                    "line": marker["line"],
+                    "function": marker["function"],
+                }
+                for marker in opcode_markers
+            ],
+        )
 
     def test_gecko_opcode_markers_not_emitted_when_disabled(self):
         """Test that no opcode markers when opcodes=False."""
@@ -2729,8 +2862,9 @@ class TestGeckoOpcodeMarkers(unittest.TestCase):
         ]
         collector.collect(frames2)
 
-        # opcode_state should not be tracked
-        self.assertEqual(len(collector.opcode_state), 0)
+        profile = collector._build_profile()
+        self.assertEqual(gecko_opcode_marker_data(profile), [])
+        self.assertEqual(profile["meta"]["markerSchema"], [])
 
     def test_gecko_opcode_with_none_opcode(self):
         """Test that None opcode doesn't cause issues."""
@@ -2746,9 +2880,8 @@ class TestGeckoOpcodeMarkers(unittest.TestCase):
         ]
         collector.collect(frames)
 
-        # Should track the state but opcode is None
-        self.assertIn(1, collector.opcode_state)
-        self.assertIsNone(collector.opcode_state[1][0])
+        profile = collector._build_profile()
+        self.assertEqual(gecko_opcode_marker_data(profile), [])
 
 
 class TestCollectorFrameFormat(unittest.TestCase):
diff --git a/Misc/NEWS.d/next/Library/2026-06-03-13-51-29.gh-issue-150662.ELT8Vg.rst b/Misc/NEWS.d/next/Library/2026-06-03-13-51-29.gh-issue-150662.ELT8Vg.rst
new file mode 100644
index 000000000000..42ed6ad7cd3c
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-06-03-13-51-29.gh-issue-150662.ELT8Vg.rst
@@ -0,0 +1,4 @@
+Fix the ``--gecko`` collector in :mod:`profiling.sampling` that kept every
+sample in memory. It now writes sample and marker data to temporary files
+and reads them back, ultimately building the output file at the end. Patch
+by Pablo Galindo and Maurycy PawÅowski-WieroÅski.