Commit 0afaaf6c authored by Jan Kiene's avatar Jan Kiene
Browse files

Merge branch 'kiene/ubsan-error-reporting' into 'main'

[CI] sanitizer error reporting helper script

See merge request !2444
parents 507fd13e 1f2840a3
Loading
Loading
Loading
Loading
Loading
+291 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3

from numpy import trace
import pandas as pd
from xml.etree import ElementTree
import argparse
from enum import Enum
from typing import List, Tuple
import re
import os
from pathlib import Path
import logging


# Configure the root logger at import time. With level INFO, the
# logging.debug() calls in this module are suppressed.
logging.basicConfig(level=logging.INFO)


class SanitizerError:
    """A single sanitizer finding parsed from a test log.

    Instances compare equal and hash by ``location`` only, so putting them
    into a ``set`` keeps one finding per source location. Ordering is by
    location first; among findings at the same location, the one with more
    recovered command lines sorts first.
    """

    # Sanitizer name expected right after "SUMMARY: " in the last traceback
    # line; overridden by the concrete subclasses.
    SUMMARY_ID = ""

    def __init__(
        self, traceback: str, commandlines: dict, testcase: str, cwd: Path = Path(".")
    ) -> None:
        """Store the raw data and derive ``type`` and ``location``.

        Args:
            traceback: Sanitizer report text; its last line is the SUMMARY line.
            commandlines: Mapping of executable name to command line, where an
                empty string means no command line was found.
            testcase: Name of the test case the report came from.
            cwd: Directory that absolute source locations are made relative to.
        """
        self.traceback = traceback
        self.commandlines = commandlines
        self.testcase = testcase
        self.type, self.location = self.parse_type_and_location(traceback, cwd)

    def __hash__(self):
        return hash(self.location)

    def __eq__(self, other):
        if not isinstance(other, SanitizerError):
            return NotImplemented
        return self.location == other.location

    def __repr__(self):
        return f"<{self.__class__.__name__} at {self.location}>"

    def __lt__(self, other):
        if not isinstance(other, SanitizerError):
            return NotImplemented
        # order by string comparison of location as first criterion
        if self.location != other.location:
            return self.location < other.location
        # same location: the smaller one is the one with more found command
        # lines, i.e. with fewer empty entries
        return self._num_missing_commandlines() < other._num_missing_commandlines()

    def _num_missing_commandlines(self) -> int:
        """Return how many executables have no command line recorded."""
        return sum(1 for cmdl in self.commandlines.values() if cmdl == "")

    def to_dict(self) -> dict:
        """Return a flat dict of this finding, including one key per executable."""
        return {
            "testcase": self.testcase,
            "sanitizer": self.__class__.__name__.replace("Error", "").upper(),
            "location": self.location,
            "type": self.type,
            "traceback": self.traceback,
            **self.commandlines,
        }

    def parse_type_and_location(self, traceback, cwd) -> Tuple[str, str]:
        """Extract error type and source location from the SUMMARY line.

        Args:
            traceback: Sanitizer report text; only its last line is inspected.
            cwd: Base directory used to shorten an absolute location.

        Returns:
            Tuple ``(type, location)`` with location as ``file:line:column``.

        Raises:
            ValueError: If the last line is not a SUMMARY line of this
                sanitizer or does not contain a type and a .c/.h location.
        """
        last_line = traceback.split("\n")[-1].strip()
        if not last_line.startswith(f"SUMMARY: {self.SUMMARY_ID}"):
            raise ValueError(
                f"Expected a 'SUMMARY: {self.SUMMARY_ID}' line, got: {last_line!r}"
            )
        m = re.match(
            r"SUMMARY: " + self.SUMMARY_ID + r": ([a-z-]*) (.*\/.*\.[ch]:\d+:\d+) in",
            last_line,
        )
        if m is None:
            raise ValueError(f"Cannot parse type and location from: {last_line!r}")

        err_type, location = m.groups()

        if Path(location).is_absolute():
            try:
                location = str(Path(location).relative_to(cwd))
            except ValueError:
                # location is not below cwd (always the case for a relative
                # cwd such as the default "."): keep the absolute path
                pass
        return err_type, location


class UsanError(SanitizerError):
    """Finding whose SUMMARY line names UndefinedBehaviorSanitizer."""
    SUMMARY_ID = "UndefinedBehaviorSanitizer"


class MsanError(SanitizerError):
    """Finding whose SUMMARY line names MemorySanitizer."""
    SUMMARY_ID = "MemorySanitizer"


class AsanError(SanitizerError):
    """Finding reported by AddressSanitizer or by LeakSanitizer."""

    SUMMARY_ID = "AddressSanitizer"

    def parse_type_and_location(self, traceback, cwd) -> Tuple[str, str]:
        """Extract error type and source location from an ASan/LSan report.

        The first traceback line decides which kind of report it is. For
        AddressSanitizer reports the SUMMARY line (last line) is parsed; for
        LeakSanitizer reports the type is fixed and the location is the last
        token of the first line that starts with "#1".

        Raises:
            NotImplementedError: If the first line names neither sanitizer.
        """
        lines = traceback.split("\n")
        header = lines[0].strip()

        if "AddressSanitizer" in header:
            summary = lines[-1].strip()
            assert summary.startswith(f"SUMMARY: {self.SUMMARY_ID}")
            summary_regex = (
                r"SUMMARY: "
                + self.SUMMARY_ID
                + r": ([a-z-]*) (.*\/.*\.[ch]:\d+:\d+) in"
            )
            found = re.match(summary_regex, summary)
            assert found is not None
            kind, location = found.groups()
        elif "LeakSanitizer" in header:
            kind = "memory leaks"
            # Only the first leak is used for the location, even if the report
            # lists more; perfect accuracy is not needed here.
            # This assumes that frame #0 always is the executable itself and
            # has no file associated.
            location = next(
                (ln.split()[-1] for ln in lines if ln.strip().startswith("#1")),
                "",
            )
        else:
            raise NotImplementedError("Unknown Asan type")

        if not Path(location).is_absolute():
            return kind, location
        return kind, str(Path(location).relative_to(cwd))


def parse_commandlines_from_sysout(sysout: str, cwd: Path) -> dict:
    """Find the first command line of each known executable in a test log.

    Args:
        sysout: Captured log text of one test case.
        cwd: Base directory handed to ``postprocess_cmdline`` for shortening paths.

    Returns:
        Dict keyed by executable name; the value is the post-processed command
        line, or an empty string when none was found.
    """
    executables = (
        "IVAS_cod",
        "networkSimulator_g192",
        "eid-xor",
        "IVAS_dec",
        "IVAS_rend",
        "ISAR_post_rend",
    )
    found = dict.fromkeys(executables, "")

    for raw_line in sysout.splitlines():
        # The executable name is repeated in the sanitizer traceback
        # (" in _start " frames), and the renderer tests mention it in
        # CalledProcessError messages; neither is a command line.
        if " in _start " in raw_line or "CalledProcessError" in raw_line:
            continue

        stripped = raw_line.strip()
        for exe in executables:
            if re.search(exe, raw_line) is None:
                continue
            # lines such as "eid-xor command:" start with the name but are
            # not command lines either
            if stripped.startswith(exe):
                continue

            if found[exe] == "":
                found[exe] = postprocess_cmdline(stripped, cwd, exe)
            else:
                logging.debug(
                    f"Commandline for {exe} already found, skip second one."
                )

            # assumption: only one commandline per line
            break

    return found


def postprocess_cmdline(cmdline: str, cwd: Path, exe: str) -> str:
    """Shorten a logged command line so it is readable and comparable.

    Text in front of the executable token is dropped, the "-q" flag is
    removed, and absolute paths are shortened: bitstream (.192), network
    simulator trace (.netsimtrace) and trailing .wav output files keep only
    their file name, other absolute paths are made relative to ``cwd``.
    Absolute paths that are not located below ``cwd`` are kept unchanged.

    Args:
        cmdline: Log line containing the command line.
        cwd: Directory that absolute paths are made relative to.
        exe: Name of the executable the command line belongs to.

    Returns:
        The processed command line with tokens joined by single spaces.
    """
    # only use the line from the token that includes the exe name onwards
    # (needed for the renderer tests). The last token containing the name
    # wins; str.index() locates the first occurrence of that token's text.
    start = 0
    for token in cmdline.split():
        if exe in token:
            start = cmdline.index(token)

    tokens = cmdline[start:].split()
    last_pos = len(tokens) - 1
    processed = []

    for pos, token in enumerate(tokens):
        # remove the "quiet" flag
        if token == "-q":
            continue

        path = Path(token)
        if not path.is_absolute():
            processed.append(token)
        elif path.suffix in (".192", ".netsimtrace") or (
            # a .wav file only counts as output when it is the final token;
            # compare positions so a repeated path is still recognised
            path.suffix == ".wav" and pos == last_pos
        ):
            # for output and bitstream files only keep the filename
            processed.append(path.name)
        else:
            try:
                processed.append(str(path.relative_to(cwd)))
            except ValueError:
                # path lives outside cwd (e.g. a temp dir): keep it as is
                processed.append(token)

    return " ".join(processed)


def parse_errors_from_sysout(
    sysout: str, testcase_name: str, cwd: Path
) -> List[UsanError]:
    """Collect all sanitizer reports contained in one test-case log.

    The log is scanned line by line. A report begins at a line matching one
    of the sanitizer start patterns and ends at the next line starting with
    "SUMMARY:"; all lines in between (inclusive) form the traceback.

    Args:
        sysout: Captured log text of one test case.
        testcase_name: Name stored on every created error object.
        cwd: Base directory used for shortening paths.

    Returns:
        One error object per report, in order of appearance.
    """
    logging.debug(testcase_name)
    cmdlines = parse_commandlines_from_sysout(sysout, cwd)

    usan_start = re.compile(r"(lib_.+|apps)\/(.*\.[ch]):(\d+):(\d+): runtime error:")
    msan_marker = re.compile(r" MemorySanitizer: ")
    asan_start = re.compile(r"==\d+==ERROR: .+Sanitizer: ")

    found = []
    collecting = False  # True while inside a report
    report_lines = []
    report_cls = None
    for raw in sysout.splitlines():
        # hack for the weird renderer cases: drop a leading "E"
        line = raw.removeprefix("E").strip()

        is_summary = line.startswith("SUMMARY:")
        is_usan = usan_start.search(line) is not None
        has_msan = msan_marker.search(line) is not None
        is_asan = asan_start.search(line) is not None
        # the MemorySanitizer marker is also part of its SUMMARY line, which
        # must not be taken as the start of a new report
        is_msan = has_msan and not is_summary

        starts = [is_usan, is_msan, is_asan]
        assert starts.count(True) <= 1

        if any(starts):
            assert not collecting
            collecting = True
            report_lines = []
            if is_usan:
                report_cls = UsanError
            elif has_msan:
                report_cls = MsanError
            else:
                report_cls = AsanError

        if collecting:
            report_lines.append(line)

        if is_summary:
            assert collecting
            found.append(
                report_cls("\n".join(report_lines), cmdlines, testcase_name, cwd)
            )
            collecting = False

    return found


def main(args):
    """Extract unique sanitizer errors from a JUnit XML report into a CSV file.

    Args:
        args: Parsed command line arguments providing ``xml_report`` (input
            path), ``outfile`` (CSV output path) and ``inject_cwd`` (base
            directory for shortening paths).
    """
    tree = ElementTree.parse(args.xml_report)
    root = tree.getroot()

    errors = []
    # iter() finds the test cases regardless of whether the root element is
    # <testsuites> or a single <testsuite>
    for tc in root.iter("testcase"):
        tc_name = tc.attrib["name"]
        errors_found = []
        # hack for weird renderer tests that don't play the same game as everyone else...
        # if we don't find anything in system-out, try in failure
        for tag in ("system-out", "failure"):
            for node in tc.findall(tag):
                # .text is None for an empty element
                errors_found.extend(
                    parse_errors_from_sysout(node.text or "", tc_name, args.inject_cwd)
                )
            if errors_found:
                break
        errors.extend(errors_found)

    # Errors compare equal by location. Sorting before building the set means
    # that, of several equal errors, the one that sorts first is the one kept.
    unique_errors = sorted(set(sorted(errors)))
    print(f"Found {len(unique_errors)} unique errors")

    df = pd.DataFrame([e.to_dict() for e in unique_errors])
    df.to_csv(args.outfile, index=False)


if __name__ == "__main__":
    # Command line entry point: two positional paths plus an optional
    # override of the directory used for shortening paths.
    cli = argparse.ArgumentParser()
    for positional in ("xml_report", "outfile"):
        cli.add_argument(positional)
    cli.add_argument(
        "--inject_cwd",
        type=Path,
        default=Path(os.getcwd()).absolute(),
        help="Use this as cwd when pruning the long paths in the command lines. Debug option for testing.",
    )

    main(cli.parse_args())
+1 −0
Original line number Diff line number Diff line
@@ -32,6 +32,7 @@ implicit-signed-integer-truncation:lib_dec.c
implicit-signed-integer-truncation:longarith.c
implicit-signed-integer-truncation:tcq_position_arith.c
implicit-signed-integer-truncation:tools.c
implicit-signed-integer-truncation:ivas_objectRenderer_hrFilt.c
shift-base:basop32.c
shift-base:enh40.c
shift-base:enh40.h