(Proposal) Adjustments to "decomp" language (#308)

* Adjustments to "decomp" language * Fix a comment * Fix accidental clang-formatting * Fix order * Fix order * Remove junk * Fix OFFSET * Adjustments based on new suggestions * Annotate globals * Globals in ISLE * More globals * Merge from parser2 branch * Allow prepending space for exact marker match * To eliminate noise, require the 0x prefix on offset for marker match * fix test from previous * Count tab stops for indented functions to reduce MISSED_END_OF_FUNCTION noise * FUNCTION to SYNTHETIC where needed * Missed marker conversion on SetAtomId * pylint cleanup, remove unused code * Fix unexpected function end, add more unit tests * Be more strict about synthetic name syntax * Revert "Missed marker conversion on SetAtomId" This reverts commit d87d665127. * Revert "FUNCTION to SYNTHETIC where needed" This reverts commit 8c815418d2. * Implicit lookup by name for functions * Fix VTABLE SYNTHETIC and other decomp markers * Get vtable class name * Vtable marker should identify struct * No colon for SIZE comment * Update README.md * Update README.md * Update CONTRIBUTING.md * Update README.md * Update README.md * Update CONTRIBUTING.md * Update README.md * Update CONTRIBUTING.md * Fix destructor/annotation * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md --------- Co-authored-by: disinvite <disinvite@users.noreply.github.com>
2025-12-11 08:33:13 +00:00 · 2023-12-06 07:10:45 -05:00
parent 4f5b70013f
commit 494a556f8e
407 changed files with 3505 additions and 2493 deletions
--- a/tools/isledecomp/isledecomp/parser/__init__.py
+++ b/tools/isledecomp/isledecomp/parser/__init__.py
@@ -1 +1 @@
-from .parser import find_code_blocks
+from .parser import DecompParser
--- a/tools/isledecomp/isledecomp/parser/error.py
+++ b/tools/isledecomp/isledecomp/parser/error.py
@@ -0,0 +1,41 @@
+from enum import Enum
+
+
+class ParserError(Enum):
+    # WARN: Stub function exceeds some line number threshold
+    UNLIKELY_STUB = 100
+
+    # WARN: Decomp marker is close enough to be recognized, but does not follow syntax exactly
+    BAD_DECOMP_MARKER = 101
+
+    # WARN: Multiple markers in sequence do not have distinct modules
+    DUPLICATE_MODULE = 102
+
+    # WARN: Detected a duplicate module/offset pair in the current file
+    DUPLICATE_OFFSET = 103
+
+    # WARN: We read a line that matches the decomp marker pattern, but we are not set up
+    # to handle it
+    BOGUS_MARKER = 104
+
+    # WARN: New function marker appeared while we were inside a function
+    MISSED_END_OF_FUNCTION = 105
+
+    # WARN: We found an opening curly brace where we expected the function
+    # signature. This is wrong, but we still have enough to make a match with reccmp
+    MISSED_START_OF_FUNCTION = 106
+
+    # WARN: A blank line appeared between the end of FUNCTION markers
+    # and the start of the function. We can ignore it, but the line shouldn't be there
+    UNEXPECTED_BLANK_LINE = 107
+
+    # ERROR: We found a marker unexpectedly
+    UNEXPECTED_MARKER = 200
+
+    # ERROR: We found a marker where we expected to find one, but it is incompatible
+    # with the preceding markers.
+    # For example, a GLOBAL cannot follow FUNCTION/STUB
+    INCOMPATIBLE_MARKER = 201
+
+    # ERROR: The line following a synthetic marker was not a comment
+    BAD_SYNTHETIC = 202
--- a/tools/isledecomp/isledecomp/parser/node.py
+++ b/tools/isledecomp/isledecomp/parser/node.py
@@ -0,0 +1,41 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class ParserNode:
+    line_number: int
+
+
+@dataclass
+class ParserAlert(ParserNode):
+    code: int
+    line: str
+
+
+@dataclass
+class ParserSymbol(ParserNode):
+    module: str
+    offset: int
+
+
+@dataclass
+class ParserFunction(ParserSymbol):
+    name: str
+    lookup_by_name: bool = False
+    is_stub: bool = False
+    is_synthetic: bool = False
+    is_template: bool = False
+    end_line: int = -1
+
+
+@dataclass
+class ParserVariable(ParserSymbol):
+    name: str
+    size: int = -1
+    is_static: bool = False
+
+
+@dataclass
+class ParserVtable(ParserSymbol):
+    class_name: str
+    num_entries: int = -1
--- a/tools/isledecomp/isledecomp/parser/parser.py
+++ b/tools/isledecomp/isledecomp/parser/parser.py
@@ -1,145 +1,394 @@
 # C++ file parser

-from typing import List, TextIO
+from typing import List, Iterable, Iterator
 from enum import Enum
 from .util import (
-    CodeBlock,
-    OffsetMatch,
+    DecompMarker,
    is_blank_or_comment,
-    match_offset_comment,
-    get_template_function_name,
+    match_marker,
+    is_marker_exact,
+    get_class_name,
+    get_synthetic_name,
    remove_trailing_comment,
-    distinct_by_module,
 )
+from .node import (
+    ParserAlert,
+    ParserFunction,
+    ParserVariable,
+    ParserVtable,
+)
+from .error import ParserError


 class ReaderState(Enum):
-    WANT_OFFSET = 0
+    SEARCH = 0
    WANT_SIG = 1
    IN_FUNC = 2
    IN_TEMPLATE = 3
    WANT_CURLY = 4
-    FUNCTION_DONE = 5
+    IN_GLOBAL = 5
+    IN_FUNC_GLOBAL = 6
+    IN_VTABLE = 7


-def find_code_blocks(stream: TextIO) -> List[CodeBlock]:
-    """Read the IO stream (file) line-by-line and give the following report:
-    Foreach code block (function) in the file, what are its starting and
-    ending line numbers, and what is the given offset in the original
-    binary. We expect the result to be ordered by line number because we
-    are reading the file from start to finish."""
+def marker_is_stub(marker: DecompMarker) -> bool:
+    return marker.type.upper() == "STUB"

-    blocks: List[CodeBlock] = []

-    offset_matches: List[OffsetMatch] = []
+def marker_is_variable(marker: DecompMarker) -> bool:
+    return marker.type.upper() == "GLOBAL"

-    function_sig = None
-    start_line = None
-    end_line = None
-    state = ReaderState.WANT_OFFSET

-    # 1-based to match cvdump and your text editor
-    # I know it says 0, but we will increment before each readline()
-    line_no = 0
-    can_seek = True
+def marker_is_synthetic(marker: DecompMarker) -> bool:
+    return marker.type.upper() in ("SYNTHETIC", "TEMPLATE")

-    while True:
-        # Do this before reading again so that an EOF will not
-        # cause us to miss the last function of the file.
-        if state == ReaderState.FUNCTION_DONE:
-            # Our list of offset marks could have duplicates on
-            # module name, so we'll eliminate those now.
-            for offset_match in distinct_by_module(offset_matches):
-                block = CodeBlock(
-                    offset=offset_match.address,
-                    signature=function_sig,
-                    start_line=start_line,
+
+def marker_is_template(marker: DecompMarker) -> bool:
+    return marker.type.upper() == "TEMPLATE"
+
+
+def marker_is_function(marker: DecompMarker) -> bool:
+    return marker.type.upper() in ("FUNCTION", "STUB")
+
+
+def marker_is_vtable(marker: DecompMarker) -> bool:
+    return marker.type.upper() == "VTABLE"
+
+
+class MarkerDict:
+    def __init__(self):
+        self.markers: dict = {}
+
+    def insert(self, marker: DecompMarker) -> bool:
+        """Return True if this insert would overwrite"""
+        module = marker.module.upper()
+        if module in self.markers:
+            return True
+
+        self.markers[module] = (marker.type, marker.offset)
+        return False
+
+    def iter(self) -> Iterator[DecompMarker]:
+        for module, (marker_type, offset) in self.markers.items():
+            yield DecompMarker(marker_type, module, offset)
+
+    def empty(self):
+        self.markers = {}
+
+
+class DecompParser:
+    # pylint: disable=too-many-instance-attributes
+    # Could combine output lists into a single list to get under the limit,
+    # but not right now
+    def __init__(self):
+        # The lists to be populated as we parse
+        self.functions: List[ParserFunction] = []
+        self.vtables: List[ParserVtable] = []
+        self.variables: List[ParserVariable] = []
+        self.alerts: List[ParserAlert] = []
+
+        self.line_number: int = 0
+        self.state: ReaderState = ReaderState.SEARCH
+
+        self.last_line: str = ""
+
+        # To allow for multiple markers where code is shared across different
+        # modules, save lists of compatible markers that appear in sequence
+        self.fun_markers = MarkerDict()
+        self.var_markers = MarkerDict()
+        self.tbl_markers = MarkerDict()
+
+        # To handle functions that are entirely indented (i.e. those defined
+        # in class declarations), remember how many whitespace characters
+        # came before the opening curly brace and match that up at the end.
+        # This should give us the same or better accuracy for a well-formed file.
+        # The alternative is counting the curly braces on each line
+        # but that's probably too cumbersome.
+        self.curly_indent_stops: int = 0
+
+        # For non-synthetic functions, save the line number where the function begins
+        # (i.e. where we see the curly brace) along with the function signature.
+        # We will need both when we reach the end of the function.
+        self.function_start: int = 0
+        self.function_sig: str = ""
+
+    def reset(self):
+        self.functions = []
+        self.vtables = []
+        self.variables = []
+        self.alerts = []
+
+        self.line_number = 0
+        self.state = ReaderState.SEARCH
+
+        self.last_line = ""
+
+        self.fun_markers.empty()
+        self.var_markers.empty()
+        self.tbl_markers.empty()
+
+        self.curly_indent_stops = 0
+        self.function_start = 0
+        self.function_sig = ""
+
+    def _recover(self):
+        """We hit a syntax error and need to reset temp structures"""
+        self.state = ReaderState.SEARCH
+        self.fun_markers.empty()
+        self.var_markers.empty()
+        self.tbl_markers.empty()
+
+    def _syntax_warning(self, code):
+        self.alerts.append(
+            ParserAlert(
+                line_number=self.line_number,
+                code=code,
+                line=self.last_line.strip(),
+            )
+        )
+
+    def _syntax_error(self, code):
+        self._syntax_warning(code)
+        self._recover()
+
+    def _function_starts_here(self):
+        self.function_start = self.line_number
+
+    def _function_marker(self, marker: DecompMarker):
+        if self.fun_markers.insert(marker):
+            self._syntax_warning(ParserError.DUPLICATE_MODULE)
+        self.state = ReaderState.WANT_SIG
+
+    def _synthetic_marker(self, marker: DecompMarker):
+        if self.fun_markers.insert(marker):
+            self._syntax_warning(ParserError.DUPLICATE_MODULE)
+        self.state = ReaderState.IN_TEMPLATE
+
+    def _function_done(self, lookup_by_name: bool = False, unexpected: bool = False):
+        end_line = self.line_number
+        if unexpected:
+            # If we missed the end of the previous function, assume it ended
+            # on the previous line and that whatever we are tracking next
+            # begins on the current line.
+            end_line -= 1
+
+        for marker in self.fun_markers.iter():
+            self.functions.append(
+                ParserFunction(
+                    line_number=self.function_start,
+                    module=marker.module,
+                    offset=marker.offset,
+                    lookup_by_name=lookup_by_name,
+                    is_stub=marker_is_stub(marker),
+                    is_synthetic=marker_is_synthetic(marker),
+                    is_template=marker_is_template(marker),
+                    name=self.function_sig,
                    end_line=end_line,
-                    offset_comment=offset_match.comment,
-                    module=offset_match.module,
-                    is_template=offset_match.is_template,
-                    is_stub=offset_match.is_stub,
                )
-                blocks.append(block)
-            offset_matches = []
-            state = ReaderState.WANT_OFFSET
+            )

-        if can_seek:
-            line_no += 1
-            line = stream.readline()
-            if line == "":
-                break
+        self.fun_markers.empty()
+        self.curly_indent_stops = 0
+        self.state = ReaderState.SEARCH

-        new_match = match_offset_comment(line)
-        if new_match is not None:
-            # We will allow multiple offsets if we have just begun
-            # the code block, but not after we hit the curly brace.
-            if state in (
-                ReaderState.WANT_OFFSET,
-                ReaderState.IN_TEMPLATE,
+    def _vtable_marker(self, marker: DecompMarker):
+        if self.tbl_markers.insert(marker):
+            self._syntax_warning(ParserError.DUPLICATE_MODULE)
+        self.state = ReaderState.IN_VTABLE
+
+    def _vtable_done(self, class_name: str = None):
+        if class_name is None:
+            # Best we can do
+            class_name = self.last_line.strip()
+
+        for marker in self.tbl_markers.iter():
+            self.vtables.append(
+                ParserVtable(
+                    line_number=self.line_number,
+                    module=marker.module,
+                    offset=marker.offset,
+                    class_name=class_name,
+                )
+            )
+
+        self.tbl_markers.empty()
+        self.state = ReaderState.SEARCH
+
+    def _variable_marker(self, marker: DecompMarker):
+        if self.var_markers.insert(marker):
+            self._syntax_warning(ParserError.DUPLICATE_MODULE)
+
+        if self.state in (ReaderState.IN_FUNC, ReaderState.IN_FUNC_GLOBAL):
+            self.state = ReaderState.IN_FUNC_GLOBAL
+        else:
+            self.state = ReaderState.IN_GLOBAL
+
+    def _variable_done(self):
+        for marker in self.var_markers.iter():
+            self.variables.append(
+                ParserVariable(
+                    line_number=self.line_number,
+                    module=marker.module,
+                    offset=marker.offset,
+                    name=self.last_line.strip(),
+                )
+            )
+
+        self.var_markers.empty()
+        if self.state == ReaderState.IN_FUNC_GLOBAL:
+            self.state = ReaderState.IN_FUNC
+        else:
+            self.state = ReaderState.SEARCH
+
+    def _handle_marker(self, marker: DecompMarker):
+        # Cannot handle any markers between function sig and opening curly brace
+        if self.state == ReaderState.WANT_CURLY:
+            self._syntax_error(ParserError.UNEXPECTED_MARKER)
+            return
+
+        # TODO: How uncertain are we of detecting the end of a function
+        # in a clang-formatted file? For now we assume we have missed the
+        # end if we detect a non-GLOBAL marker while state is IN_FUNC.
+        # Maybe these cases should be syntax errors instead
+
+        if marker_is_function(marker):
+            if self.state in (
+                ReaderState.SEARCH,
                ReaderState.WANT_SIG,
            ):
-                # If we detected an offset marker unexpectedly,
-                # we are handling it here so we can continue seeking.
-                can_seek = True
-
-                offset_matches.append(new_match)
-
-                if new_match.is_template:
-                    state = ReaderState.IN_TEMPLATE
-                else:
-                    state = ReaderState.WANT_SIG
-            else:
+                # We will allow multiple offsets if we have just begun
+                # the code block, but not after we hit the curly brace.
+                self._function_marker(marker)
+            elif self.state == ReaderState.IN_FUNC:
                # We hit another offset unexpectedly.
                # We can recover easily by just ending the function here.
-                end_line = line_no - 1
-                state = ReaderState.FUNCTION_DONE
+                self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION)
+                self._function_done(unexpected=True)

-                # Pause reading here so we handle the offset marker
-                # on the next loop iteration
-                can_seek = False
+                # Start the next function right after so we can
+                # read the next line.
+                self._function_marker(marker)
+            else:
+                self._syntax_error(ParserError.INCOMPATIBLE_MARKER)

-        elif state == ReaderState.IN_TEMPLATE:
+        elif marker_is_synthetic(marker):
+            if self.state in (ReaderState.SEARCH, ReaderState.IN_TEMPLATE):
+                self._synthetic_marker(marker)
+            elif self.state == ReaderState.IN_FUNC:
+                self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION)
+                self._function_done(lookup_by_name=True, unexpected=True)
+                self._synthetic_marker(marker)
+            else:
+                self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
+
+        elif marker_is_variable(marker):
+            if self.state in (
+                ReaderState.SEARCH,
+                ReaderState.IN_GLOBAL,
+                ReaderState.IN_FUNC,
+                ReaderState.IN_FUNC_GLOBAL,
+            ):
+                self._variable_marker(marker)
+            else:
+                self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
+
+        elif marker_is_vtable(marker):
+            if self.state in (ReaderState.SEARCH, ReaderState.IN_VTABLE):
+                self._vtable_marker(marker)
+            elif self.state == ReaderState.IN_FUNC:
+                self._syntax_warning(ParserError.MISSED_END_OF_FUNCTION)
+                self._function_done(unexpected=True)
+                self._vtable_marker(marker)
+            else:
+                self._syntax_error(ParserError.INCOMPATIBLE_MARKER)
+
+        else:
+            self._syntax_warning(ParserError.BOGUS_MARKER)
+
+    def read_line(self, line: str):
+        self.last_line = line  # TODO: Useful or hack for error reporting?
+        self.line_number += 1
+
+        marker = match_marker(line)
+        if marker is not None:
+            # TODO: what's the best place for this?
+            # Does it belong with reading or marker handling?
+            if not is_marker_exact(self.last_line):
+                self._syntax_warning(ParserError.BAD_DECOMP_MARKER)
+            self._handle_marker(marker)
+            return
+
+        line_strip = line.strip()
+        if self.state == ReaderState.IN_TEMPLATE:
            # TEMPLATE functions are a special case. The signature is
            # given on the next line (in a // comment)
-            function_sig = get_template_function_name(line)
-            start_line = line_no
-            end_line = line_no
-            state = ReaderState.FUNCTION_DONE
+            name = get_synthetic_name(line)
+            if name is None:
+                self._syntax_error(ParserError.BAD_SYNTHETIC)
+            else:
+                self.function_sig = name
+                self._function_starts_here()
+                self._function_done(lookup_by_name=True)

-        elif state == ReaderState.WANT_SIG:
-            # Skip blank lines or comments that come after the offset
-            # marker. There is not a formal procedure for this, so just
-            # assume the next "code line" is the function signature
-            if not is_blank_or_comment(line):
+        elif self.state == ReaderState.WANT_SIG:
+            # Ignore blanks on the way to function start or function name
+            if len(line_strip) == 0:
+                self._syntax_warning(ParserError.UNEXPECTED_BLANK_LINE)
+
+            elif line_strip.startswith("//"):
+                # If we found a comment, assume implicit lookup-by-name
+                # function and end here. We know this is not a decomp marker
+                # because it would have been handled already.
+                self.function_sig = get_synthetic_name(line)
+                self._function_starts_here()
+                self._function_done(lookup_by_name=True)
+
+            elif line_strip == "{":
+                # We missed the function signature but we can recover from this
+                self.function_sig = "(unknown)"
+                self._function_starts_here()
+                self._syntax_warning(ParserError.MISSED_START_OF_FUNCTION)
+                self.state = ReaderState.IN_FUNC
+
+            else:
                # Inline functions may end with a comment. Strip that out
                # to help parsing.
-                function_sig = remove_trailing_comment(line.strip())
+                self.function_sig = remove_trailing_comment(line_strip)

                # Now check to see if the opening curly bracket is on the
                # same line. clang-format should prevent this (BraceWrapping)
                # but it is easy to detect.
                # If the entire function is on one line, handle that too.
-                if function_sig.endswith("{"):
-                    start_line = line_no
-                    state = ReaderState.IN_FUNC
-                elif function_sig.endswith("}") or function_sig.endswith("};"):
-                    start_line = line_no
-                    end_line = line_no
-                    state = ReaderState.FUNCTION_DONE
+                if self.function_sig.endswith("{"):
+                    self._function_starts_here()
+                    self.state = ReaderState.IN_FUNC
+                elif self.function_sig.endswith("}") or self.function_sig.endswith(
+                    "};"
+                ):
+                    self._function_starts_here()
+                    self._function_done()
                else:
-                    state = ReaderState.WANT_CURLY
+                    self.state = ReaderState.WANT_CURLY

-        elif state == ReaderState.WANT_CURLY:
-            if line.strip() == "{":
-                start_line = line_no
-                state = ReaderState.IN_FUNC
+        elif self.state == ReaderState.WANT_CURLY:
+            if line_strip == "{":
+                self.curly_indent_stops = line.index("{")
+                self._function_starts_here()
+                self.state = ReaderState.IN_FUNC

-        elif state == ReaderState.IN_FUNC:
-            # Naive but reasonable assumption that functions will end with
-            # a curly brace on its own line with no prepended spaces.
-            if line.startswith("}"):
-                end_line = line_no
-                state = ReaderState.FUNCTION_DONE
+        elif self.state == ReaderState.IN_FUNC:
+            if line_strip.startswith("}") and line[self.curly_indent_stops] == "}":
+                self._function_done()

-    return blocks
+        elif self.state in (ReaderState.IN_GLOBAL, ReaderState.IN_FUNC_GLOBAL):
+            if not is_blank_or_comment(line):
+                self._variable_done()
+
+        elif self.state == ReaderState.IN_VTABLE:
+            vtable_class = get_class_name(line)
+            if vtable_class is not None:
+                self._vtable_done(class_name=vtable_class)
+
+    def read_lines(self, lines: Iterable):
+        for line in lines:
+            self.read_line(line)
--- a/tools/isledecomp/isledecomp/parser/util.py
+++ b/tools/isledecomp/isledecomp/parser/util.py
@@ -1,44 +1,17 @@
 # C++ Parser utility functions and data structures
 from __future__ import annotations  # python <3.10 compatibility
 import re
-from typing import List
 from collections import namedtuple

+DecompMarker = namedtuple("DecompMarker", ["type", "module", "offset"])

-CodeBlock = namedtuple(
-    "CodeBlock",
-    [
-        "offset",
-        "signature",
-        "start_line",
-        "end_line",
-        "offset_comment",
-        "module",
-        "is_template",
-        "is_stub",
-    ],
-)

-OffsetMatch = namedtuple(
-    "OffsetMatch", ["module", "address", "is_template", "is_stub", "comment"]
-)
-
-# This has not been formally established, but considering that "STUB"
-# is a temporary state for a function, we assume it will appear last,
-# after any other modifiers (i.e. TEMPLATE)
-
-# To match a reasonable variance of formatting for the offset comment
-offsetCommentRegex = re.compile(
-    r"\s*//\s*OFFSET:\s*(\w+)\s+(?:0x)?([a-f0-9]+)(\s+TEMPLATE)?(\s+STUB)?",  # nopep8
+markerRegex = re.compile(
+    r"\s*//\s*(\w+):\s*(\w+)\s+(0x[a-f0-9]+)",
    flags=re.I,
 )

-# To match the exact syntax (text upper case, hex lower case, with spaces)
-# that is used in most places
-offsetCommentExactRegex = re.compile(
-    r"^// OFFSET: [A-Z0-9]+ (0x[a-f0-9]+)( TEMPLATE)?( STUB)?$"
-)  # nopep8
-
+markerExactRegex = re.compile(r"\s*// ([A-Z]+): ([A-Z0-9]+) (0x[a-f0-9]+)$")

 # The goal here is to just read whatever is on the next line, so some
 # flexibility in the formatting seems OK
@@ -50,15 +23,15 @@ templateCommentRegex = re.compile(r"\s*//\s+(.*)")
 trailingCommentRegex = re.compile(r"(\s*(?://|/\*).*)$")


-def get_template_function_name(line: str) -> str:
-    """Parse function signature for special TEMPLATE functions"""
+def get_synthetic_name(line: str) -> str | None:
+    """Synthetic names appear on a single line comment on the line after the marker.
+    If that's not what we have, return None"""
    template_match = templateCommentRegex.match(line)

-    # If we don't match, you get whatever is on the line as the signature
    if template_match is not None:
        return template_match.group(1)

-    return line
+    return None


 def remove_trailing_comment(line: str) -> str:
@@ -78,39 +51,45 @@ def is_blank_or_comment(line: str) -> bool:
    )


-def is_exact_offset_comment(line: str) -> bool:
-    """If the offset comment does not match our (unofficial) syntax
-    we may want to alert the user to fix it for style points."""
-    return offsetCommentExactRegex.match(line) is not None
-
-
-def match_offset_comment(line: str) -> OffsetMatch | None:
-    match = offsetCommentRegex.match(line)
+def match_marker(line: str) -> DecompMarker | None:
+    match = markerRegex.match(line)
    if match is None:
        return None

-    return OffsetMatch(
-        module=match.group(1),
-        address=int(match.group(2), 16),
-        is_template=match.group(3) is not None,
-        is_stub=match.group(4) is not None,
-        comment=line.strip(),
+    return DecompMarker(
+        type=match.group(1), module=match.group(2), offset=int(match.group(3), 16)
    )


-def distinct_by_module(offsets: List) -> List:
-    """Given a list of offset markers, return a list with distinct
-    module names. If module names (case-insensitive) are repeated,
-    choose the offset that appears first."""
+def is_marker_exact(line: str) -> bool:
+    return markerExactRegex.match(line) is not None

-    if len(offsets) < 2:
-        return offsets

-    # Dict maintains insertion order in python >=3.7
-    offsets_dict = {}
-    for offset in offsets:
-        module_upper = offset.module.upper()
-        if module_upper not in offsets_dict:
-            offsets_dict[module_upper] = offset
+template_class_decl_regex = re.compile(
+    r"\s*(?:\/\/)?\s*(?:class|struct) (\w+)<([\w]+)\s*(\*+)?\s*>"
+)

-    return list(offsets_dict.values())
+
+class_decl_regex = re.compile(r"\s*(?:\/\/)?\s*(?:class|struct) (\w+)")
+
+
+def get_class_name(line: str) -> str | None:
+    """For VTABLE markers, extract the class name from the code line or comment
+    where it appears."""
+
+    match = template_class_decl_regex.match(line)
+    if match is not None:
+        # For template classes, we should reformat the class name so it matches
+        # the output from cvdump: one space between the template type and any asterisks
+        # if it is a pointer type.
+        (class_name, template_type, asterisks) = match.groups()
+        if asterisks is not None:
+            return f"{class_name}<{template_type} {asterisks}>"
+
+        return f"{class_name}<{template_type}>"
+
+    match = class_decl_regex.match(line)
+    if match is not None:
+        return match.group(1)
+
+    return None