Unverified Commit 4f6b4a0f authored by norvell's avatar norvell
Browse files

Add scripts/find_regressions_from_logs2.py as alternative analysis script

parent 29820a98
Loading
Loading
Loading
Loading
Loading
+189 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3

import argparse
from pathlib import Path
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def main(logs_dir, output_filename, measure, days, all_results, diff_thr, ratio_thr, curr_value_thr):
    """Report the largest day-to-day increases of ``measure`` and plot them.

    Every sub-directory of ``logs_dir`` is treated as one date (the directory
    name is the date label) and every ``*.csv`` inside it as one job log.  For
    each analysed date a ``regressions_<measure>_<date>.csv`` file is written
    to the current working directory with the ten largest ratios
    (current / previous value) per format, and the worst ratio per format and
    date is plotted into ``output_filename`` as an HTML page.

    Args:
        logs_dir: Directory containing one sub-directory per date.
        output_filename: Path of the HTML plot to write.
        measure: Name of the CSV column to analyse.
        days: Number of date-to-date comparisons to analyse.  A negative value
            or a value larger than the available history selects the whole
            available history.
        all_results: Accepted for command-line compatibility; not used by the
            current implementation.
        diff_thr: Accepted for command-line compatibility; not used.
        ratio_thr: Accepted for command-line compatibility; not used.
        curr_value_thr: Accepted for command-line compatibility; not used.

    Raises:
        ValueError: If no results could be read from ``logs_dir``.
    """
    result, format_table, sha = _load_scores(logs_dir, measure)
    dates = result.columns[2:]  # everything after "job" and "testcase"

    # Ratio of every date against the preceding one; the first date has no
    # predecessor and is fixed to 1.
    ratio = result.copy()
    ratio[dates[0]] = 1
    for prevdate, currdate in zip(dates[:-1], dates[1:]):
        ratio[currdate] = result[currdate] / result[prevdate]

    # The format of a test case is taken from the most recent date.
    last_date = dates[-1]
    result.insert(2, "format", format_table[last_date])
    ratio.insert(2, "format", format_table[last_date])
    formats = result["format"].dropna().unique().tolist()

    # NOTE(review): the analysed window ends at the second-to-last date, i.e.
    # the most recent date is never used as "current" date.  This mirrors the
    # original indexing -- confirm that skipping the latest log is intended.
    n_dates = len(dates)
    max_days = max(n_dates - 2, 0)
    if days < 0 or days > max_days:
        # Negative means "whole history"; a too long request is shortened to
        # what the logs actually cover instead of raising IndexError.
        days = max_days
    first = n_dates - 1 - days
    window = dates[first : n_dates - 1]

    plotdata = pd.DataFrame(0.0, index=formats, columns=window)
    plottext = pd.DataFrame("", index=formats, columns=window)

    for pos in range(first, n_dates - 1):
        prevdate, currdate = dates[pos - 1], dates[pos]
        regressions = _top_regressions(result, ratio, sha, prevdate, currdate)

        csv_filename = f"regressions_{measure}_{currdate}.csv"
        regressions.to_csv(csv_filename, sep=";", index=False)

        # Store worst case per format for plotting.  Rows without a ratio
        # (value missing on either date) cannot be the worst case; formats
        # with no usable row keep the initial 0.0 / "" entries.
        ranked = regressions.dropna(subset=["ratio"])
        if ranked.empty:
            continue
        worst_rows = ranked.groupby("format")["ratio"].idxmax()
        for fmt, row_label in worst_rows.items():
            worst = regressions.loc[row_label]
            plotdata.loc[fmt, currdate] = worst["ratio"]
            plottext.loc[fmt, currdate] = f"{worst['job']} - {worst['testcase']} - Max {measure} ratio: {worst['ratio']:.2f}"

    _plot_worst_ratios(plotdata, plottext, measure, output_filename)


def _load_scores(logs_dir, measure):
    """Read all job logs and return (values table, formats table, sha per date).

    Both tables have the columns ``job``, ``testcase`` followed by one column
    per date and share the same row order.
    """
    scores = {}  # date -> job -> testcase -> (value, format)
    sha = {}  # date -> sha parsed from the log file name
    for log_dir in Path(logs_dir).iterdir():
        if not log_dir.is_dir():
            continue
        date = log_dir.name
        scores[date] = {}
        for logfile in log_dir.glob("*.csv"):
            # The job name and the sha are encoded in the dash-separated
            # file name: fields 3..-5 form the job, the last field (without
            # extension) is the sha.
            parts = logfile.name.split("-")
            job = "-".join(parts[3:-4])
            sha[date] = parts[-1].split(".")[0]
            data = pd.read_csv(logfile, usecols=["testcase", measure, "format"])
            # A repeated testcase within a file keeps its last entry.
            scores[date][job] = dict(
                zip(data["testcase"], zip(data[measure], data["format"]))
            )

    value_rows = []
    format_rows = []
    for date, jobs in scores.items():
        for job, testcases in jobs.items():
            for testcase, (value, fmt) in testcases.items():
                value_rows.append((job, testcase, date, value))
                format_rows.append((job, testcase, date, fmt))

    if not value_rows:
        raise ValueError(f"No '{measure}' results found in the CSV logs under {logs_dir}")

    result = pd.DataFrame(value_rows, columns=["job", "testcase", "date", "value"])
    result = result.pivot(
        index=["job", "testcase"], columns="date", values="value"
    ).reset_index()

    format_table = pd.DataFrame(format_rows, columns=["job", "testcase", "date", "format"])
    format_table = format_table.pivot(
        index=["job", "testcase"], columns="date", values="format"
    ).reset_index()

    return result, format_table, sha


def _top_regressions(result, ratio, sha, prevdate, currdate, top_n=10):
    """Build the table of the ``top_n`` largest ratios per format for one date pair."""
    top_rows = ratio.groupby("format")[currdate].nlargest(top_n).index.get_level_values(1)
    # drop=True keeps the old row labels out of the table, so the columns
    # inserted below follow directly after job/testcase/format.
    table = (
        result[["job", "testcase", "format", prevdate, currdate]]
        .loc[top_rows]
        .reset_index(drop=True)
    )
    # The leading apostrophe prevents Excel from reading the sha as a number.
    table.insert(3, "prev_date", prevdate)
    table.insert(4, "prev_sha", "'" + sha[prevdate])
    table.insert(5, "curr_date", currdate)
    table.insert(6, "curr_sha", "'" + sha[currdate])
    table.insert(7, "diff", table[currdate] - table[prevdate])
    table.insert(8, "ratio", table[currdate] / table[prevdate])
    table.loc[table[prevdate] == 0, "ratio"] = 1  # Set ratio to 1 for denominator 0
    return table


def _plot_worst_ratios(plotdata, plottext, measure, output_filename):
    """Write one subplot per format (two per row) with the worst ratio per date."""
    formats = list(plotdata.index)
    n_rows = max(1, (len(formats) + 1) // 2)  # grid grows with the number of formats

    fig = make_subplots(
        rows=n_rows,
        cols=2,
        subplot_titles=[f"{fmt}" for fmt in formats],
        shared_xaxes="columns",
    )

    for i, fmt in enumerate(formats):
        fig.add_trace(
            go.Scatter(
                x=pd.to_datetime(plotdata.columns),
                y=plotdata.loc[fmt],
                mode="lines+markers",
                name=f"Max {measure}",
                hovertext=plottext.loc[fmt],
                hoverinfo="text",
                showlegend=False,
            ),
            row=i // 2 + 1,
            col=i % 2 + 1,
        )

    fig.update_layout(
        title_text=f"Regression detection: Max {measure} ratio",
        legend=dict(x=1, y=1, orientation="v"),
        hovermode="x unified",
    )

    fig.update_xaxes(automargin=True)
    fig.update_yaxes(automargin=True)

    # Save to html
    fig.write_html(output_filename)

# Command-line entry point: parse the arguments and hand them to main().
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Find regressions in per-date log directories and plot the worst ratio per format"
    )
    parser.add_argument(
        "logs_dir",
        type=str,
        help="Logs dir, e.g. logs",
    )
    parser.add_argument(
        "output_filename",
        type=str,
        # The file is produced with plotly's write_html, so it is an HTML page.
        help="Filename of the output html plot, e.g. mld.html",
    )
    parser.add_argument(
        "--measure",
        type=str,
        help="Measure for summary, one of MLD MIN_SSNR MAX_ABS_DIFF MIN_ODG, (default: MLD)",
        default="MLD",
    )
    parser.add_argument(
        "--days",
        type=int,
        help="Number of days in history, (default: whole history)",
        default=-1,
    )
    parser.add_argument(
        "--all_results",
        action="store_true",
        help="Output all results, including cases without regression (default: off)",
        default=False,
    )
    parser.add_argument(
        "--diff_thr",
        type=float,
        help="Include test cases with diff above diff_thr, (default: 0.0)",
        default=0.0,
    )
    parser.add_argument(
        "--ratio_thr",
        type=float,
        help="Include test cases with ratio above ratio_thr, (default: 1.0)",
        default=1.0,
    )
    parser.add_argument(
        "--curr_value_thr",
        type=float,
        help="Include test cases with curr_value above curr_value_thr, (default: 0.0)",
        default=0.0,
    )

    args = parser.parse_args()

    main(args.logs_dir, args.output_filename, args.measure, args.days, args.all_results, args.diff_thr, args.ratio_thr, args.curr_value_thr)