GH-109373: Store metadata required for pystats comparison in the JSON (GH-109374)

author Michael Droettboom <mdboom@gmail.com>

Fri, 15 Sep 2023 20:10:46 +0000 (16:10 -0400)

committer GitHub <noreply@github.com>

Fri, 15 Sep 2023 20:10:46 +0000 (13:10 -0700)
author Michael Droettboom <mdboom@gmail.com>
Fri, 15 Sep 2023 20:10:46 +0000 (16:10 -0400)
committer GitHub <noreply@github.com>
Fri, 15 Sep 2023 20:10:46 +0000 (13:10 -0700)
diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py

index 484dfe8a2b7dbde3d4772ab086347488d4df0203..3b2bdd8015be4ab17c18b1413855dcbe6b417626 100644 (file)
--- a/Tools/scripts/summarize_stats.py
+++ b/Tools/scripts/summarize_stats.py
@@ -2,11 +2,14 @@
  default stats folders.
  """
  
+# NOTE: Bytecode introspection modules (opcode, dis, etc.) should only
+# be used when loading a single dataset. When comparing datasets, they
+# could give wrong results, leading to subtle errors.
+
  import argparse
  import collections
  import json
  import os.path
-import opcode
  from datetime import date
  import itertools
  import sys
@@ -28,6 +31,16 @@ def format_ratio(num, den):
      else:
          return f"{num/den:.01%}"
  
+def percentage_to_float(s):
+    """
+    Converts a percentage string (e.g. "12.5%") to a float (12.5).  The empty string is returned as 0.0.
+    """
+    if s == "":
+        return 0.0
+    else:
+        assert s[-1] == "%"
+        return float(s[:-1])
+
  def join_rows(a_rows, b_rows):
      """
      Joins two tables together, side-by-side, where the first column in each is a
@@ -164,7 +177,12 @@ def gather_stats(input):
  
      if os.path.isfile(input):
          with open(input, "r") as fd:
-            return json.load(fd)
+            stats = json.load(fd)
+
+        stats["_stats_defines"] = {int(k): v for k, v in stats["_stats_defines"].items()}
+        stats["_defines"] = {int(k): v for k, v in stats["_defines"].items()}
+        return stats
+
      elif os.path.isdir(input):
          stats = collections.Counter()
          for filename in os.listdir(input):
@@ -179,6 +197,16 @@ def gather_stats(input):
                      value = int(value)
                      stats[key] += value
              stats['__nfiles__'] += 1
+
+        import opcode
+
+        stats["_specialized_instructions"] = [
+            op for op in opcode._specialized_opmap.keys()
+            if "__" not in op
+        ]
+        stats["_stats_defines"] = get_stats_defines()
+        stats["_defines"] = get_defines()
+
          return stats
      else:
          raise ValueError(f"{input:r} is not a file or directory path")
@@ -223,13 +251,10 @@ def kind_to_text(kind, defines, opname):
              return pretty(name[len(opname)+1:])
      return "kind " + str(kind)
  
-def categorized_counts(opcode_stats):
+def categorized_counts(opcode_stats, specialized_instructions):
      basic = 0
      specialized = 0
      not_specialized = 0
-    specialized_instructions = {
-        op for op in opcode._specialized_opmap.keys()
-        if "__" not in op}
      for name, opcode_stat in opcode_stats.items():
          if "execution_count" not in opcode_stat:
              continue
@@ -348,7 +373,7 @@ def emit_comparative_execution_counts(
                  (opcode, base_entry[0], head_entry[0],
                   f"{100*change:0.1f}%"))
  
-        rows.sort(key=lambda x: -abs(float(x[-1][:-1])))
+        rows.sort(key=lambda x: -abs(percentage_to_float(x[-1])))
  
          emit_table(
              ("Name", "Base Count:", "Head Count:", "Change:"),
@@ -361,14 +386,12 @@ def get_defines():
          defines = parse_kinds(spec_src)
      return defines
  
-def emit_specialization_stats(opcode_stats):
-    defines = get_defines()
+def emit_specialization_stats(opcode_stats, defines):
      with Section("Specialization stats", summary="specialization stats by family"):
          for name, opcode_stat in opcode_stats.items():
              print_specialization_stats(name, opcode_stat, defines)
  
-def emit_comparative_specialization_stats(base_opcode_stats, head_opcode_stats):
-    defines = get_defines()
+def emit_comparative_specialization_stats(base_opcode_stats, head_opcode_stats, defines):
      with Section("Specialization stats", summary="specialization stats by family"):
          opcodes = set(base_opcode_stats.keys()) & set(head_opcode_stats.keys())
          for opcode in opcodes:
@@ -376,17 +399,21 @@ def emit_comparative_specialization_stats(base_opcode_stats, head_opcode_stats):
                  opcode, base_opcode_stats[opcode], head_opcode_stats[opcode], defines
              )
  
-def calculate_specialization_effectiveness(opcode_stats, total):
-    basic, not_specialized, specialized = categorized_counts(opcode_stats)
+def calculate_specialization_effectiveness(
+    opcode_stats, total, specialized_instructions
+):
+    basic, not_specialized, specialized = categorized_counts(
+        opcode_stats, specialized_instructions
+    )
      return [
          ("Basic", basic, format_ratio(basic, total)),
          ("Not specialized", not_specialized, format_ratio(not_specialized, total)),
          ("Specialized", specialized, format_ratio(specialized, total)),
      ]
  
-def emit_specialization_overview(opcode_stats, total):
+def emit_specialization_overview(opcode_stats, total, specialized_instructions):
      with Section("Specialization effectiveness"):
-        rows = calculate_specialization_effectiveness(opcode_stats, total)
+        rows = calculate_specialization_effectiveness(opcode_stats, total, specialized_instructions)
          emit_table(("Instructions", "Count:", "Ratio:"), rows)
          for title, field in (("Deferred", "specialization.deferred"), ("Misses", "specialization.miss")):
              total = 0
@@ -404,10 +431,16 @@ def emit_specialization_overview(opcode_stats, total):
                      rows = [ (name, count, format_ratio(count, total)) for (count, name) in counts[:10] ]
                      emit_table(("Name", "Count:", "Ratio:"), rows)
  
-def emit_comparative_specialization_overview(base_opcode_stats, base_total, head_opcode_stats, head_total):
+def emit_comparative_specialization_overview(
+    base_opcode_stats, base_total, head_opcode_stats, head_total, specialized_instructions
+):
      with Section("Specialization effectiveness"):
-        base_rows = calculate_specialization_effectiveness(base_opcode_stats, base_total)
-        head_rows = calculate_specialization_effectiveness(head_opcode_stats, head_total)
+        base_rows = calculate_specialization_effectiveness(
+            base_opcode_stats, base_total, specialized_instructions
+        )
+        head_rows = calculate_specialization_effectiveness(
+            head_opcode_stats, head_total, specialized_instructions
+        )
          emit_table(
              ("Instructions", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"),
              join_rows(base_rows, head_rows)
@@ -419,8 +452,7 @@ def get_stats_defines():
          defines = parse_kinds(stats_src, prefix="EVAL_CALL")
      return defines
  
-def calculate_call_stats(stats):
-    defines = get_stats_defines()
+def calculate_call_stats(stats, defines):
      total = 0
      for key, value in stats.items():
          if "Calls to" in key:
@@ -439,17 +471,17 @@ def calculate_call_stats(stats):
              rows.append((key, value, format_ratio(value, total)))
      return rows
  
-def emit_call_stats(stats):
+def emit_call_stats(stats, defines):
      with Section("Call stats", summary="Inlined calls and frame stats"):
-        rows = calculate_call_stats(stats)
+        rows = calculate_call_stats(stats, defines)
          emit_table(("", "Count:", "Ratio:"), rows)
  
-def emit_comparative_call_stats(base_stats, head_stats):
+def emit_comparative_call_stats(base_stats, head_stats, defines):
      with Section("Call stats", summary="Inlined calls and frame stats"):
-        base_rows = calculate_call_stats(base_stats)
-        head_rows = calculate_call_stats(head_stats)
+        base_rows = calculate_call_stats(base_stats, defines)
+        head_rows = calculate_call_stats(head_stats, defines)
          rows = join_rows(base_rows, head_rows)
-        rows.sort(key=lambda x: -float(x[-1][:-1]))
+        rows.sort(key=lambda x: -percentage_to_float(x[-1]))
          emit_table(
              ("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"),
              rows
@@ -584,9 +616,9 @@ def output_single_stats(stats):
      total = get_total(opcode_stats)
      emit_execution_counts(opcode_stats, total)
      emit_pair_counts(opcode_stats, total)
-    emit_specialization_stats(opcode_stats)
-    emit_specialization_overview(opcode_stats, total)
-    emit_call_stats(stats)
+    emit_specialization_stats(opcode_stats, stats["_defines"])
+    emit_specialization_overview(opcode_stats, total, stats["_specialized_instructions"])
+    emit_call_stats(stats, stats["_stats_defines"])
      emit_object_stats(stats)
      emit_gc_stats(stats)
      with Section("Meta stats", summary="Meta statistics"):
@@ -604,12 +636,13 @@ def output_comparative_stats(base_stats, head_stats):
          base_opcode_stats, base_total, head_opcode_stats, head_total
      )
      emit_comparative_specialization_stats(
-        base_opcode_stats, head_opcode_stats
+        base_opcode_stats, head_opcode_stats, head_stats["_defines"]
      )
      emit_comparative_specialization_overview(
-        base_opcode_stats, base_total, head_opcode_stats, head_total
+        base_opcode_stats, base_total, head_opcode_stats, head_total,
+        head_stats["_specialized_instructions"]
      )
-    emit_comparative_call_stats(base_stats, head_stats)
+    emit_comparative_call_stats(base_stats, head_stats, head_stats["_stats_defines"])
      emit_comparative_object_stats(base_stats, head_stats)
      emit_comparative_gc_stats(base_stats, head_stats)
author	Michael Droettboom <mdboom@gmail.com>
	Fri, 15 Sep 2023 20:10:46 +0000 (16:10 -0400)
committer	GitHub <noreply@github.com>
	Fri, 15 Sep 2023 20:10:46 +0000 (13:10 -0700)