--- /dev/null
+#!/usr/bin/env python3
+# pyright: strict
+
+# Copyright 2024 Free Software Foundation, Inc.
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+# Due to the pyelftools dependency, this script requires Python version
+# 3.10 or greater to run.
+
+"""A utility to convert ELF files with DWARF info to Dwarf::assemble code.
+
+Usage:
+ python ./asm_to_dwarf_assembler.py <path/to/elf/file>
+
+Dependencies:
+ Python >= 3.10
+ pyelftools >= 0.31
+
+Notes:
+- Line tables are not currently supported.
+- Non-contiguous subprograms are not currently supported.
+- If you want to use $srcfile or similar, you must edit the references to the
+ file name manually, including DW_AT_name attributes on compile units.
+- If run with binaries generated by make check-gdb, it may include an
+ additional compile_unit before and after the actual compile units. This is
+ an artifact of the normal compilation process, as these CUs are indeed in
+ the generated DWARF in some cases.
+"""
+
+import errno
+import re
+import sys
+from copy import copy
+from dataclasses import dataclass
+from datetime import datetime
+from functools import cache
+from io import BytesIO, IOBase
+from logging import getLogger
+from typing import Annotated, Optional
+
+from elftools.dwarf.compileunit import CompileUnit as RawCompileUnit
+from elftools.dwarf.die import DIE as RawDIE
+from elftools.dwarf.die import AttributeValue
+from elftools.elf.elffile import ELFFile
+
# Module-level logger, registered under this file's path.
logger = getLogger(__file__)


# Forms whose value is a DWARF expression or a raw block.  Decoding them is
# not supported, but they still have to be recognised so that the form can be
# replaced with SPECIAL_expr and the placeholder {MANUAL} expr list written
# in place of the value.
EXPR_ATTRIBUTE_FORMS = ["DW_FORM_exprloc"] + [
    "DW_FORM_block" + size_suffix for size_suffix in ("", "1", "2", "4")
]


# Brace characters kept in variables so that editors do not trip over
# unbalanced braces inside string literals.
lbrace = "{"
rbrace = "}"
+
+
@cache
def get_indent_str(indent_count: int) -> str:
    """Get whitespace string to prepend to another for indenting.

    Every pair of indentation levels is written as one tab, and an odd
    level left over is written as four spaces.  Assuming the usual
    eight-column tab stops, that makes every level four columns wide, so
    odd and even levels line up.

    The result is memoised by functools.cache, keyed on indent_count.
    """
    tab_count, odd_level = divmod(indent_count, 2)
    # odd_level is 0 or 1; the width is spelled as a product so the number
    # of spaces stays unambiguous even if whitespace in this file is
    # reformatted.
    return "\t" * tab_count + " " * 4 * odd_level
+
+
def indent(line: str, indent_count: int) -> str:
    """Return line prefixed with the whitespace for indent_count levels."""
    prefix = get_indent_str(indent_count)
    return f"{prefix}{line}"
+
+
def labelify_str(s: str) -> str:
    """Turn s into text that is usable as a label name.

    Three rewrites are applied in order: every "*" becomes the word "ptr",
    every character that is not a regex "word" character becomes "_", and
    runs of two or more "_" are collapsed into a single "_".
    """
    # Spell out pointer stars.
    spelled_out = s.replace("*", "ptr")

    # Anything outside \w turns into an underscore.
    underscored = re.sub(r"\W", "_", spelled_out)

    # Squeeze consecutive underscores.
    return re.sub(r"__+", "_", underscored)
+
+
+class DWARFAttribute:
+ """Storage unit for a single DWARF attribute.
+
+ All its values are strings that are usually passed on
+ directly to format. The exceptions to this are attributes
+ with int values with DW_FORM_ref4 or DW_FORM_ref_addr form.
+ Their values are interpreted as the global offset of the DIE
+ being referenced, which are looked up dynamically to fetch
+ their labels.
+ """
+
+ def __init__(
+ self,
+ die_offset: int,
+ name: str,
+ value: str | bytes | int | bool,
+ form=None,
+ ):
+ self.die_offset = die_offset
+ self.name = name
+ self.value = value
+ self.form = form
+
+ def _format_expr_value(self) -> str:
+ self.form = "SPECIAL_expr"
+ return "{ MANUAL: Fill expr list }"
+
+ def _needs_escaping(self, str_value: str) -> bool:
+ charset = set(str_value)
+ return bool(charset.intersection({"{", "}", " ", "\t"}))
+
+ def _format_str(self, str_value: str) -> str:
+ if self._needs_escaping(str_value):
+ escaped_str = str(str_value)
+ # Replace single escape (which is itself escaped because of regex)
+ # with a double escape (which doesn't mean anything to regex so
+ # it doesn't need escaping).
+ escaped_str = re.sub(r"\\", r"\\", escaped_str)
+ escaped_str = re.sub("([{}])", r"\\\1", escaped_str)
+ return "{" + escaped_str + "}"
+ else:
+ return str_value
+
+ def _format_value(
+ self, offset_die_lookup: dict[int, "DWARFDIE"], indent_count: int = 0
+ ) -> str:
+ if self.form in EXPR_ATTRIBUTE_FORMS:
+ return self._format_expr_value()
+ elif isinstance(self.value, bool):
+ return str(int(self.value))
+ elif isinstance(self.value, int):
+ if self.form == "DW_FORM_ref4":
+ # ref4-style referencing label.
+ die = offset_die_lookup[self.value]
+ return ":$" + die.tcl_label
+ elif self.form == "DW_FORM_ref_addr":
+ # ref_addr-style referencing label.
+ die = offset_die_lookup[self.value]
+ return "%$" + die.tcl_label
+ else:
+ return str(self.value)
+ elif isinstance(self.value, bytes):
+ return self._format_str(self.value.decode("ascii"))
+ elif isinstance(self.value, str):
+ return self._format_str(self.value)
+ else:
+ raise NotImplementedError(f"Unknown data type: {type(self.value)}")
+
+ def format(
+ self, offset_die_lookup: dict[int, "DWARFDIE"], indent_count: int = 0
+ ) -> str:
+ """Format the attribute in the form {name value form}.
+
+ If form is DW_FORM_exprloc or DW_FORM_block, see next section on
+ DWARFOperations.
+
+ If it isn't, value is formatted as follows:
+ If bool, use "1" if True, "0" if False.
+ If int:
+ If form is DW_FORM_ref4, use ":$label" where label is the
+ tcl_label of the DWARFDIE at offset "value".
+ If form is DW_FORM_ref_addr, use "%$label" where label is
+ the tcl_label of the DWARFDIE at offset "value".
+ Else, use value directly.
+ If bytes, use value.decode("ascii")
+ If str, use value directly.
+ Any other type results in a NotImplementedError being raised.
+
+ Regarding DW_FORM_exprloc and DW_FORM_block:
+ The form is replaced with SPECIAL_expr.
+ The entries in the value are interpreted and decoded using the
+ dwarf_operations dictionary, and replaced with their names where
+ applicable.
+ """
+ s = lbrace
+ s += self.name + " "
+ s += self._format_value(offset_die_lookup)
+
+ # Only explicitly state form if it's not a reference.
+ if self.form not in [None, "DW_FORM_ref4", "DW_FORM_ref_addr"]:
+ s += " " + self.form
+
+ s += rbrace
+ return indent(s, indent_count)
+
+
class DWARFDIE:
    """This script's parsed version of a RawDIE."""

    def __init__(
        self,
        offset: int,
        tag: str,
        attrs: dict[str, "DWARFAttribute"],
        tcl_label: Optional[str] = None,
    ):
        """Store the DIE's data; attrs is shallow-copied, children start empty."""
        self.offset: Annotated[int, "Global offset of the DIE."] = offset
        self.tag: Annotated[str, "DWARF tag for this DIE."] = tag
        self.attrs: Annotated[
            dict[str, "DWARFAttribute"], "Dict of attributes for this DIE."
        ] = copy(attrs)
        self.children: Annotated[
            list["DWARFDIE"], "List of child DIEs of this DIE."
        ] = []
        self.tcl_label: Annotated[
            Optional[str],
            "Label used by the Tcl code to reference this DIE, if any. These "
            'take the form of "label: " before the actual DIE definition.',
        ] = tcl_label

    def format_lines(
        self, offset_die_lookup: dict[int, "DWARFDIE"], indent_count: int = 0
    ) -> list[str]:
        """Get the list of lines that represent this DIE in Dwarf assembler.

        The output is the tag (preceded by "label: " when tcl_label is
        set), a braced attribute list and, only when there are children, a
        second braced block holding the children one level deeper.
        """
        # Prepend label to first line, if it's set.
        label_prefix = f"{self.tcl_label}: " if self.tcl_label else ""

        # First line, including label.
        die_lines = [indent(label_prefix + self.tag + " " + lbrace, indent_count)]

        # Format attributes, if any.
        if self.attrs:
            die_lines.extend(
                attr.format(offset_die_lookup, indent_count=indent_count + 1)
                for attr in self.attrs.values()
            )
            die_lines.append(indent(rbrace, indent_count))
        else:
            # Don't create a new line, just append and immediately close the
            # brace on the last line.
            die_lines[-1] += rbrace

        # Format children, if any.
        if self.children:
            # Only open a new brace if there are any children for the
            # current DIE.
            die_lines[-1] += " " + lbrace
            for child in self.children:
                die_lines.extend(
                    child.format_lines(
                        offset_die_lookup, indent_count=indent_count + 1
                    )
                )
            die_lines.append(indent(rbrace, indent_count))

        return die_lines

    def format(
        self, offset_die_lookup: dict[int, "DWARFDIE"], indent_count: int = 0
    ) -> str:
        """Join result from format_lines into a single str."""
        return "\n".join(self.format_lines(offset_die_lookup, indent_count))

    def name(self) -> Optional[str]:
        """Get DW_AT_name as text, or None if the DIE has no such attribute.

        A bytes value is decoded as UTF-8, with undecodable bytes replaced
        by U+FFFD; any other value is converted with str().
        """
        name_attr = self.attrs.get("DW_AT_name")
        if name_attr is None:
            return None

        raw_name = name_attr.value
        if isinstance(raw_name, bytes):
            return raw_name.decode("utf-8", errors="replace")
        return str(raw_name)

    def type_name(self) -> str:
        """Name of Dwarf tag, with the "DW_TAG_" prefix removed."""
        return self.tag.removeprefix("DW_TAG_")
+
+
class DWARFCompileUnit(DWARFDIE):
    """DIE subclass for compile units.

    Dwarf::assemble does not accept a bare compile unit DIE.  A plain DIE
    is written as:
        DW_TAG_compile_unit {
            <attributes>
        } {
            <children>
        }

    whereas a CU has to be nested inside a call to the cu proc:
        cu { <cu_special_vars> } {
            DW_TAG_compile_unit {
                <attributes>
            } {
                <children>
            }
        }
    """

    # Default value for parameter is_64 defined in dwarf.exp line 1553.
    # It is turned into 0/1 when the Dwarf::assemble code is emitted.
    default_is_64 = False

    # Default value for parameter dwarf_version defined in dwarf.exp line 1552.
    default_dwarf_version = 4

    # The is_fission parameter (default defined in dwarf.exp line 1556) has
    # no counterpart here; fission is not implemented.

    # Tag that signifies a DIE is a compile unit.
    compile_unit_tag = "DW_TAG_compile_unit"

    def __init__(
        self,
        raw_die: RawDIE,
        raw_cu: RawCompileUnit,
        attrs: dict[str, DWARFAttribute],
    ):
        """Set up the DIE part plus the values needed for the cu header.

        On top of what DWARFDIE stores, the instance keeps:
        - raw_cu
            The pyelftools compile unit this DIE came from.
        - dwarf_version: int
            The "version" entry of the CU header, falling back to
            DWARFCompileUnit.default_dwarf_version.
        - addr_size: Optional[int]
            The "address_size" entry of the CU header, None if absent.
        - is_64_bit: bool
            True when raw_cu.dwarf_format() reports 64.

        These values configure the first parameter of the cu proc (which
        contains calls to the compile_unit proc in the body of
        Dwarf::assemble).
        """
        super().__init__(raw_die.offset, DWARFCompileUnit.compile_unit_tag, attrs)
        self.raw_cu = raw_cu

        cu_header = raw_cu.header
        self.dwarf_version: int = cu_header.get(
            "version", DWARFCompileUnit.default_dwarf_version
        )
        self.addr_size: Optional[int] = cu_header.get("address_size")
        self.is_64_bit: bool = raw_cu.dwarf_format() == 64

        # Neither fission nor CU labels are handled: the original author
        # did not find where pyelftools exposes that information.

    def format_lines(
        self,
        offset_die_lookup: dict[int, DWARFDIE],
        indent_count: int = 0,
    ) -> list[str]:
        """Return the DIE's lines, one level deeper, inside a cu {} {} call."""
        return [
            self._get_header(indent_count),
            *super().format_lines(offset_die_lookup, indent_count + 1),
            indent(rbrace, indent_count),
        ]

    def _get_header(self, indent_count: int = 0) -> str:
        """Assemble the first line of the surrounding 'cu {} {}' proc call.

        A parameter is only listed when it differs from its default;
        addr_size is listed whenever it is known.
        """
        cu_params: list[str] = []

        if self.is_64_bit != DWARFCompileUnit.default_is_64:
            # True/False is written as 1/0.
            cu_params.extend(("is_64", str(int(self.is_64_bit))))

        if self.dwarf_version != DWARFCompileUnit.default_dwarf_version:
            cu_params.extend(("version", str(self.dwarf_version)))

        if self.addr_size is not None:
            cu_params.extend(("addr_size", str(self.addr_size)))

        # The fission and label parameters would be added here once they
        # are implemented.

        opening = indent("cu " + lbrace, indent_count)
        return opening + " ".join(cu_params) + rbrace + " " + lbrace
+
+
class DWARFParser:
    """Converter from pyelftools's DWARF representation to this script's."""

    # Forms of class "constant".  A DW_AT_high_pc that uses one of them holds
    # an offset from DW_AT_low_pc rather than an address.
    high_pc_offset_forms = frozenset(
        {
            "DW_FORM_data1",
            "DW_FORM_data2",
            "DW_FORM_data4",
            "DW_FORM_data8",
            "DW_FORM_data16",
            "DW_FORM_udata",
            "DW_FORM_sdata",
            "DW_FORM_implicit_const",
        }
    )

    def __init__(self, elf_file: IOBase):
        """Init parser with file opened in binary mode.

        The whole file is read into memory, all compile units are parsed
        and labels are created for every DIE that is referenced by another
        one.

        File can be closed after this function is called.
        """
        self.raw_data = BytesIO(elf_file.read())
        self.elf_data = ELFFile(self.raw_data)
        self.dwarf_info = self.elf_data.get_dwarf_info()
        self.offset_to_die: dict[int, DWARFDIE] = {}
        self.label_to_die: dict[str, DWARFDIE] = {}
        self.referenced_offsets: Annotated[
            set[int], "The set of all offsets that were referenced by some DIE."
        ] = set()
        self.raw_cu_list: list[RawCompileUnit] = []
        self.top_level_dies: list[DWARFDIE] = []
        # Named subprogram DIEs, i.e. the ones whose PC attributes were
        # replaced by Tcl variables.
        self.subprograms: list[DWARFDIE] = []
        self.taken_labels: set[str] = set()

        self._read_all_cus()
        self._create_necessary_labels()

    def _read_all_cus(self) -> None:
        """Read every CU in self.dwarf_info, see _read_cu."""
        for cu in self.dwarf_info.iter_CUs():
            self._read_cu(cu)

    def _read_cu(self, raw_cu: RawCompileUnit) -> None:
        """Add raw_cu to self.raw_cu_list and parse all its non-null DIEs."""
        self.raw_cu_list.append(raw_cu)
        for raw_die in raw_cu.iter_DIEs():
            if not raw_die.is_null():
                self._parse_die(raw_cu, raw_die)

    def _parse_die(self, die_cu: RawCompileUnit, raw_die: RawDIE) -> DWARFDIE:
        """Process a single DIE and add it to offset_to_die.

        Look for DW_FORM_ref4 and DW_FORM_ref_addr form attributes and
        replace their values with the global offset of the referenced DIE,
        adding that offset to self.referenced_offsets. This will be used
        later to assign and use labels only to DIEs that need it.

        In case the DIE is a top-level DIE, add it to self.top_level_dies.
        Otherwise, add it to the children of its (already processed) parent.

        In case the DIE is a subprogram that has a name, add it to
        self.subprograms and call self._use_vars_for_low_and_high_pc_attr
        with it. Nameless subprograms keep their numeric PC values, since
        no Tcl variable name can be derived for them.
        """
        processed_attrs: dict[str, DWARFAttribute] = {}
        attr_value: AttributeValue
        for attr_name, attr_value in raw_die.attributes.items():
            actual_value = attr_value.value
            if attr_value.form in ("DW_FORM_ref4", "DW_FORM_ref_addr"):
                referenced_die = raw_die.get_DIE_from_attribute(attr_name)
                actual_value = referenced_die.offset
                self.referenced_offsets.add(referenced_die.offset)

            processed_attrs[attr_name] = DWARFAttribute(
                raw_die.offset, attr_name, actual_value, attr_value.form
            )

        if raw_die.tag == DWARFCompileUnit.compile_unit_tag:
            processed_die = DWARFCompileUnit(raw_die, die_cu, processed_attrs)
        else:
            processed_die = DWARFDIE(raw_die.offset, raw_die.tag, processed_attrs, None)

        raw_parent = raw_die.get_parent()
        if raw_parent is None:
            # Top level DIE
            self.top_level_dies.append(processed_die)
        else:
            # Setting the parent here assumes the parent was already processed
            # prior to this DIE being found.
            # As far as I'm aware, this is always true in DWARF.
            processed_parent = self.offset_to_die[raw_parent.offset]
            processed_parent.children.append(processed_die)

        if (
            processed_die.tag == "DW_TAG_subprogram"
            and processed_die.name() is not None
        ):
            self.subprograms.append(processed_die)
            self._use_vars_for_low_and_high_pc_attr(processed_die)

        self.offset_to_die[processed_die.offset] = processed_die
        return processed_die

    def _create_necessary_labels(self) -> None:
        """Create labels to DIEs that were referenced by others.

        Offsets are visited in ascending order so that the numeric suffixes
        given to colliding labels follow the order of the DIEs.
        """
        for offset in sorted(self.referenced_offsets):
            die = self.offset_to_die[offset]
            self._create_label_for_die(die)

    def _use_vars_for_low_and_high_pc_attr(self, subprogram: DWARFDIE) -> None:
        """Replace existing PC attributes with Tcl variables.

        If DW_AT_low_pc exists for this DIE, replace it with accessing the
        variable whose name is given by self.subprogram_start_var(subprogram).

        If DW_AT_high_pc exists for this DIE and its form is one of
        self.high_pc_offset_forms, the value is an offset from the low PC,
        so replace it with accessing the variable whose name is given by
        self.subprogram_len_var(subprogram). For any other form, replace
        it with accessing the variable whose name is given by
        self.subprogram_end_var(subprogram).
        """
        low_pc_attr = subprogram.attrs.get("DW_AT_low_pc")
        if low_pc_attr is not None:
            low_pc_attr.value = self.subprogram_start_var(subprogram)

        high_pc_attr = subprogram.attrs.get("DW_AT_high_pc")
        if high_pc_attr is not None:
            if high_pc_attr.form in DWARFParser.high_pc_offset_forms:
                high_pc_attr.value = self.subprogram_len_var(subprogram)
            else:
                high_pc_attr.value = self.subprogram_end_var(subprogram)

    def _create_label_for_die(self, die: DWARFDIE) -> None:
        """Set tcl_label to a unique string among other DIEs for this parser.

        As a first attempt, use labelify_str(die.name()). If the DIE does
        not have a (non-empty) name, use labelify_str(die.type_name()).

        If the chosen initial label is already taken, try again appending "_2".
        While the attempt is still taken, try again replacing it with "_3", then
        "_4", and so on.

        This function also creates an entry on self.label_to_die. DIEs that
        already have a label are left untouched.
        """
        if die.tcl_label is not None:
            return

        base_label = labelify_str(die.name() or die.type_name())

        # Deduplicate label in case of collision.
        label = base_label
        suffix_nr = 2
        while label in self.taken_labels:
            label = f"{base_label}_{suffix_nr}"
            suffix_nr += 1

        die.tcl_label = label
        self.label_to_die[label] = die
        self.taken_labels.add(label)

    def subprogram_start_var(self, subprogram: DWARFDIE) -> str:
        """Name of the Tcl variable that holds the low PC for a subprogram."""
        return f"${subprogram.name()}_start"

    def subprogram_end_var(self, subprogram: DWARFDIE) -> str:
        """Name of the Tcl variable that holds the high PC for a subprogram."""
        return f"${subprogram.name()}_end"

    def subprogram_len_var(self, subprogram: DWARFDIE) -> str:
        """Name of the Tcl variable that holds the length of a subprogram.

        NOTE(review): relies on get_func_info defining <name>_len next to
        <name>_start and <name>_end; confirm against lib/dwarf.exp.
        """
        return f"${subprogram.name()}_len"

    def all_labels(self) -> set[str]:
        """Get a copy of the set of all labels known to the parser so far."""
        return copy(self.taken_labels)
+
+
class DWARFAssemblerGenerator:
    """Class that generates Dwarf::assemble code out of a DWARFParser."""

    def __init__(self, dwarf_parser: DWARFParser, output=sys.stdout):
        """Remember the parser to read from and the stream to print to."""
        self.dwarf_parser = dwarf_parser
        self.output = output

    def emit(self, line: str, indent_count: int) -> None:
        """Print a single line indented indent_count times to self.output.

        If line is empty, it will always print an empty line, even with nonzero
        indent_count.
        """
        if line:
            line = get_indent_str(indent_count) + line
        print(line, file=self.output)

    def generate_die(self, die: DWARFDIE, indent_count: int) -> None:
        """Generate the lines that represent a DIE."""
        # The DIE formats and indents all of its own lines, so the text is
        # emitted without any additional indentation.
        die_lines = die.format(self.dwarf_parser.offset_to_die, indent_count)
        self.emit(die_lines, 0)

    def generate(self) -> None:
        """Print the whole Dwarf::assemble call to self.output.

        The body consists of the global declarations, a declare_labels line
        (if there are any labels), one get_func_info call per distinct
        subprogram name and finally all top-level DIEs.
        """
        indent_count = 0

        self.emit("Dwarf::assemble $asm_file {", indent_count)

        # Begin Dwarf::assemble body.
        indent_count += 1
        self.emit("global srcdir subdir srcfile", indent_count)

        # Labels come as a set; sort them so that the output does not change
        # from one run to the next.
        all_labels = sorted(self.dwarf_parser.all_labels())
        if all_labels:
            self.emit("declare_labels " + " ".join(all_labels), indent_count)

        self.emit("", 0)

        # One call per function name is enough, even if several DIEs share
        # the name.  Nameless subprograms have nothing to look up.  The dict
        # keeps the names in first-seen order.
        func_names = dict.fromkeys(
            func_name
            for subprogram in self.dwarf_parser.subprograms
            if (func_name := subprogram.name()) is not None
        )
        for func_name in func_names:
            self.emit(f"get_func_info {func_name}", indent_count)

        for die in self.dwarf_parser.top_level_dies:
            self.generate_die(die, indent_count)

        # TODO: line table, if it's within scope (it probably isn't).

        # End Dwarf::assemble body.
        indent_count -= 1
        self.emit(rbrace, indent_count)
+
+
def main(argv: list[str]) -> None:
    """Convert the ELF file named by argv[1] and print the result to stdout.

    Exits with errno.EOPNOTSUPP after printing the usage text when no file
    name was given, and with errno.ENODATA when the file cannot be read or
    cannot be parsed.
    """
    try:
        filename = argv[1]
    except IndexError:
        print("Usage:", file=sys.stderr)
        print("python ./asm_to_dwarf_assembler.py <path/to/elf/file>", file=sys.stderr)
        sys.exit(errno.EOPNOTSUPP)

    try:
        with open(filename, "rb") as elf_file:
            parser = DWARFParser(elf_file)
    except OSError as e:
        # The file is missing or unreadable; do not blame its contents.
        print(f"Error reading file {filename}.", file=sys.stderr)
        print(str(e), file=sys.stderr)
        sys.exit(errno.ENODATA)
    except Exception as e:
        # Top-level boundary: report whatever the ELF/DWARF parsing raised
        # and exit instead of showing a traceback.
        print(
            "Error parsing ELF file. Does it contain DWARF information?",
            file=sys.stderr,
        )
        print(str(e), file=sys.stderr)
        sys.exit(errno.ENODATA)

    generator = DWARFAssemblerGenerator(parser)
    generator.generate()


if __name__ == "__main__":
    main(sys.argv)