isle/tools/isledecomp/isledecomp/utils.py

import os
import sys
from datetime import datetime
import logging
import colorama


def print_combined_diff(udiff, plain: bool = False, show_both: bool = False):
    """Print a combined diff: a header for each slug, then its tagged lines.

    udiff is an iterable of (slug, subgroups) pairs, or None to print nothing.
    Each subgroup is a dict with either a "both" list of
    (orig_addr, line, recomp_addr) tuples, or "orig" and "recomp" lists of
    (addr, line) tuples that are printed as "-" and "+" lines.

    If plain, don't use colorama ANSI codes.
    If show_both, print the orig and recomp address columns side by side,
    leaving blank the column that a "-" or "+" line has no address for."""
    if udiff is None:
        return

    # Resolve the color codes once. In plain mode they collapse to empty
    # strings, so the same f-strings serve both modes.
    if plain:
        red = green = reset = ""
    else:
        red = colorama.Fore.RED
        green = colorama.Fore.GREEN
        reset = colorama.Style.RESET_ALL

    # Address strings have no fixed length. Track the widest one seen so far
    # and use it to size the blank address column.
    widest = 0

    for slug, subgroups in udiff:
        print(f"{red}---")
        print(f"{green}+++")
        if plain:
            print(slug)
        else:
            print(f"{colorama.Fore.BLUE}{slug}")
            print(reset, end="")

        for subgroup in subgroups:
            matched = subgroup.get("both")

            if matched is not None:
                for orig_addr, line, recomp_addr in matched:
                    widest = max(widest, len(orig_addr))
                    label = f"{orig_addr} / {recomp_addr}" if show_both else orig_addr
                    print(f"{label} : {line}")
                continue

            for orig_addr, line in subgroup["orig"]:
                widest = max(widest, len(orig_addr))
                label = f"{orig_addr} / {'':{widest}}" if show_both else orig_addr
                print(f"{label} : {red}-{line}{reset}")

            for recomp_addr, line in subgroup["recomp"]:
                widest = max(widest, len(recomp_addr))
                label = f"{'':{widest}} / {recomp_addr}" if show_both else recomp_addr
                print(f"{label} : {green}+{line}{reset}")

        # Blank line once all subgroups under this slug have been printed.
        print()


def print_diff(udiff, plain):
    """Print the brief view of a diff in difflib.unified_diff format.

    Lines starting with "++", "--" or "@@" are left out of the output.
    If plain is false, "+" lines are printed green and "-" lines red.
    Returns True if udiff produced any line at all (including ones that
    were left out), and False if it was None or empty."""
    if udiff is None:
        return False

    # Prefixes of the lines the brief view does not need.
    hidden_prefixes = ("++", "@@", "--")

    saw_line = False
    for text in udiff:
        saw_line = True

        if text.startswith(hidden_prefixes):
            continue

        if plain:
            print(text)
            continue

        if text.startswith("+"):
            tint = colorama.Fore.GREEN
        elif text.startswith("-"):
            tint = colorama.Fore.RED
        else:
            tint = ""

        print(tint + text)
        # Stop the color from bleeding into the next line.
        print(colorama.Style.RESET_ALL, end="")

    return saw_line


def get_percent_color(value: float) -> str:
    """Return the colorama foreground code used to display the given ratio.

    Exactly 1.0 is green, anything above 0.8 is yellow,
    and every other value is red."""
    if value == 1.0:
        color = colorama.Fore.GREEN
    elif value > 0.8:
        color = colorama.Fore.YELLOW
    else:
        color = colorama.Fore.RED

    return color


def percent_string(
    ratio: float, is_effective: bool = False, is_plain: bool = False
) -> str:
    """Build the display string for a match ratio, e.g. 0.5 -> "50.00%".

    If is_effective (i.e. effective match), an asterisk follows the number.
    If is_plain, the result contains no colorama ANSI codes. Otherwise the
    number takes its color from get_percent_color, the asterisk (if any)
    is red, and the string ends with a style reset."""

    star = "*" if is_effective else ""
    number = "{:.2f}%".format(ratio * 100)

    if not is_plain:
        star_color = colorama.Fore.RED if is_effective else ""
        return (
            f"{get_percent_color(ratio)}{number}"
            f"{star_color}{star}{colorama.Style.RESET_ALL}"
        )

    return number + star


def diff_json_display(show_both_addrs: bool = False, is_plain: bool = False):
    """Build the function that renders one line of the diff report
    according to the reccmp display preferences.

    The returned formatter takes the original address plus the saved and
    current entries for it (either may be None) and returns a string like
    "<addr> - <name> (<old> -> <new>)"."""

    def describe(entry) -> str:
        # Stubs are reported by name. Everything else shows its percentage.
        if entry.get("stub", False):
            return "stub"

        return percent_string(
            entry["matching"], entry.get("effective", False), is_plain
        )

    def formatter(orig_addr, saved, new) -> str:
        # No current entry means the function is gone from the new data.
        if new is None:
            new_pct = "gone"
            name = ""
            recomp_addr = "n/a"
        else:
            new_pct = describe(new)
            # The original address is the key, so a rename is not of
            # interest here: use the current name whenever there is one.
            name = new.get("name", "")
            recomp_addr = new.get("recomp", "n/a")

        # No saved entry means the function is new in this report.
        if saved is None:
            old_pct = "new"
        else:
            old_pct = describe(saved)
            if name == "":
                name = saved.get("name", "")

        addr_string = (
            f"{orig_addr} / {recomp_addr:10}" if show_both_addrs else orig_addr
        )

        # No column alignment beyond the address: colorama's ANSI codes
        # count towards string length, so f-string padding of the colored
        # fields would not line up without extra work.
        return f"{addr_string} - {name} ({old_pct} -> {new_pct})"

    return formatter


def diff_json(
    saved_data,
    new_data,
    orig_file: str,
    show_both_addrs: bool = False,
    is_plain: bool = False,
):
    """Compare a saved diff summary against the current data and print a
    report of the functions/symbols whose match status has changed.

    saved_data is the saved report: a dict with "file", "data" and an
    optional "timestamp". new_data is the current list of entries.
    Entries are paired up by their "address" value.
    An error is logged, and nothing is printed, if the saved report is for
    a different file than orig_file."""

    # Don't try to diff a report generated for a different binary file
    current_file = os.path.basename(orig_file).lower()
    saved_file = saved_data.get("file")

    if saved_file != current_file:
        logging.getLogger().error(
            "Diff report for '%s' does not match current file '%s'",
            saved_file,
            current_file,
        )
        return

    if "timestamp" in saved_data:
        now = datetime.now().replace(microsecond=0)
        then = datetime.fromtimestamp(saved_data["timestamp"]).replace(microsecond=0)
        generated = then.strftime("%B %d %Y, %H:%M:%S")

        print(f"Saved diff report generated {generated} ({now - then} ago)")
        print()

    # Index both sides by orig_addr
    saved_by_addr = {entry["address"]: entry for entry in saved_data["data"]}
    new_by_addr = {entry["address"]: entry for entry in new_data}

    new_functions = {}
    improved_functions = {}
    degraded_functions = {}
    dropped_functions = {}
    entropy_functions = {}

    # Walk every known address in sorted order and file the (saved, new)
    # pair under each category whose criteria it meets.
    for addr in sorted(saved_by_addr.keys() | new_by_addr.keys()):
        saved = saved_by_addr.get(addr)
        new = new_by_addr.get(addr)
        pair = (saved, new)

        # Any function not in the saved file
        if saved is None:
            new_functions[addr] = pair

        # Any function missing from the current data
        if new is None:
            dropped_functions[addr] = pair

        # The remaining criteria compare the two entries, so need both.
        if saved is None or new is None:
            continue

        was_stub = saved.get("stub", False)
        is_stub = new.get("stub", False)

        # A non-stub -> stub conversion also counts as dropped
        if is_stub and not was_stub:
            dropped_functions[addr] = pair

        # Increased match percentage, or a stub -> non-stub conversion
        if new["matching"] > saved["matching"] or (was_stub and not is_stub):
            improved_functions[addr] = pair

        # Decreased match percentage where neither side is a stub
        if new["matching"] < saved["matching"] and not was_stub and not is_stub:
            degraded_functions[addr] = pair

        # Still a 100% match, but the "effective" flag has flipped
        if (
            new["matching"] == 1.0
            and saved["matching"] == 1.0
            and new.get("effective", False) != saved.get("effective", False)
        ):
            entropy_functions[addr] = pair

    get_diff_str = diff_json_display(show_both_addrs, is_plain)

    sections = (
        ("New", new_functions),
        ("Increased", improved_functions),
        ("Decreased", degraded_functions),
        ("Dropped", dropped_functions),
        ("Compiler entropy", entropy_functions),
    )

    for title, members in sections:
        # Leave out empty sections entirely.
        if not members:
            continue

        print(f"{title} ({len(members)}):")

        for addr, (saved, new) in members.items():
            print(get_diff_str(addr, saved, new))

        print()


def get_file_in_script_dir(fn):
    """Return the path of fn inside the directory that holds the running
    script, taken from the absolute form of sys.argv[0]."""
    script_path = os.path.abspath(sys.argv[0])
    script_dir = os.path.dirname(script_path)
    return os.path.join(script_dir, fn)