Commit abe55b66 authored by norvell's avatar norvell
Browse files

Merge branch 'ci/fix-long-term-logs' into 'main'

[CI] Simplify and split long-term-regression plots

See merge request !2507
parents df9f5c42 972fa593
Loading
Loading
Loading
Loading
Loading
+139 −71
Original line number Diff line number Diff line
#!/usr/bin/env python3

import os
import pandas as pd
import argparse
import plotly.express as px
import re
import plotly.graph_objects as go
from plotly.subplots import make_subplots

@@ -23,15 +25,18 @@ def read_csv_files(root_dir):


def parse_csv_data(csv_data):
    """keep 'testcase', 'format', 'MLD', 'MAX_ABS_DIFF'  and add
    'date' column."""
    cols_to_keep = ["testcase", "format", "MLD", "MAX_ABS_DIFF"]
    """keep 'testcase', 'format', 'MLD', 'MAX_ABS_DIFF', 'MIN_ODG', 'MIN_SSNR'  and add
    'date' and 'job' column."""
    cols_to_keep = ["testcase", "format", "MLD", "MAX_ABS_DIFF", "MIN_ODG", "MIN_SSNR"]
    parsed_data = {}
    for key, df in csv_data.items():
        tmp = key.split("-")
        job = "-".join(tmp[4:-4])
        cols = [col for col in cols_to_keep if col in df.columns]
        date = os.path.basename(os.path.dirname(key))
        new_df = df[cols].copy()
        new_df["date"] = date
        new_df["job"] = job
        parsed_data[key] = new_df

    # concatenate all dataframe in the dictionary
@@ -39,38 +44,67 @@ def parse_csv_data(csv_data):
    return concat_df


def plot_data(df, output_filename):
    """plot max values for 'MLD' and 'MAX_ABS_DIFF' data and save
def plot_data(df, args):
    """plot max values for measure and data and save
    to html file."""

    measure = args.measure
    days = args.days

    # Convert 'date' to datetime
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["MLD"] = pd.to_numeric(df["MLD"], errors="coerce")
    df["MAX_ABS_DIFF"] = pd.to_numeric(df["MAX_ABS_DIFF"], errors="coerce")
    df[measure] = pd.to_numeric(df[measure], errors="coerce")

    # Filter out rows older than "days"
    cutoff = df["date"].max() - pd.Timedelta(days=days)
    df = df[df["date"] > cutoff].reset_index(drop=True)

    # Drop rows with NaT and NaN
    clean_df = df.dropna(subset=["date", "MLD", "MAX_ABS_DIFF"])
    df = df.dropna(subset=["date", measure])

    # Filter test cases based on include/reject/match arguments
    if args.include:
        mask = pd.Series(False, index=df.index)
        for tag in args.include:
            mask |= df["testcase"].str.contains(tag, case=False, na=False)
        df = df[mask]
    if args.reject:
        mask = pd.Series(False, index=df.index)
        for tag in args.reject:
            mask |= df["testcase"].str.contains(tag, case=False, na=False)
        df = df[~mask]
    if args.match:
        pattern = re.compile(args.match, re.IGNORECASE)
        df = df[df["testcase"].str.contains(pattern, na=False)]

    # Filter jobs based on job-include/job-reject/job-match arguments
    if args.job_include:
        mask = pd.Series(False, index=df.index)
        for tag in args.job_include:
            mask |= df["job"].str.contains(tag, case=False, na=False)
        df = df[mask]
    if args.job_reject:
        mask = pd.Series(False, index=df.index)
        for tag in args.job_reject:
            mask |= df["job"].str.contains(tag, case=False, na=False)
        df = df[~mask]
    if args.job_match:
        pattern = re.compile(args.job_match, re.IGNORECASE)
        df = df[df["job"].str.contains(pattern, na=False)]

    # Group by 'format' and 'date' to get rows with max 'MLD' per group
    max_mld = (
        clean_df.groupby(["format", "date"])
        .apply(lambda x: x.loc[x["MLD"].idxmax()])
        .reset_index(drop=True)
    )

    # Group by 'format' and 'date' to get rows with max 'MAX_ABS_DIFF' per
    # group
    max_abs_diff = (
        clean_df.groupby(["format", "date"])
        .apply(lambda x: x.loc[x["MAX_ABS_DIFF"].idxmax()])
        .reset_index(drop=True)
    )
    # Group by 'format' and 'date' to get rows with max 'MLD' per group
    idx = df.groupby(["format", "date"])[measure].idxmax()
    max = df.loc[idx].reset_index(drop=True)
    idx = df.groupby(["format", "date"])[measure].idxmin()
    min = df.loc[idx].reset_index(drop=True)
    mean = df.groupby(["format", "date"])[measure].mean().to_frame("mean").reset_index()

    formats = sorted(clean_df["format"].unique())
    formats = sorted(df["format"].unique())

    fig = make_subplots(
        rows=5,
        cols=2,
        specs=[[{"secondary_y": True}] * 2] * 5,
        subplot_titles=[f"{i}" for i in formats],
        shared_xaxes="columns",
    )
@@ -79,64 +113,65 @@ def plot_data(df, output_filename):
        row = i // 2 + 1
        col = i % 2 + 1

        data_mld = max_mld[max_mld["format"] == fmt].sort_values("date")
        data_diff = max_abs_diff[max_abs_diff["format"]
                                 == fmt].sort_values("date")
        if "MIN" in measure:
            data = min[min["format"] == fmt].sort_values("date")
            maxmin_str = "Min"
        else:    
            data = max[max["format"] == fmt].sort_values("date")
            maxmin_str = "Max"

        # Add max 'MLD' to primary y-axis
        # Add max measure to plots
        fig.add_trace(
            go.Scatter(
                x=data_mld["date"],
                y=data_mld["MLD"],
                x=data["date"],
                y=data[measure],
                mode="lines+markers",
                name=f" {fmt} - Max MLD",
                name=f"{maxmin_str} {measure}",
                hovertext=[
                    f"Testcase: {tc}<br>MLD: {mld:.4f}<br>MAX_ABS_DIFF:"
                    f"{abs_diff}<br>Format:"
                    f" {format}<br>Date: {date.date()}"
                    for tc, mld, abs_diff, format, date in zip(
                        data_mld["testcase"],
                        data_mld["MLD"],
                        data_mld["MAX_ABS_DIFF"],
                        data_mld["format"],
                        data_mld["date"],
                    f"Testcase: {tc}<br>{maxmin_str} {measure}: {value:.4f}"
                    f"<br>Job: {job}"
                    f"<br>Date: {date.date()}"
                    for job, tc, value, date in zip(
                        data["job"],
                        data["testcase"],
                        data[measure],
                        data["date"],
                    )
                ],
                hoverinfo="text",
                marker_color="red",
                showlegend=(i == 0),
            ),
            row=row,
            col=col,
            secondary_y=False,
        )

        # Add max 'MAX_ABS_DIFF' to secondary y-axis
        data = mean[mean["format"] == fmt].sort_values("date")

        # Add mean measure to plots
        fig.add_trace(
            go.Scatter(
                x=data_diff["date"],
                y=data_diff["MAX_ABS_DIFF"],
                x=data["date"],
                y=data["mean"],
                mode="lines+markers",
                name=f"{fmt} - Max MAX_ABS_DIFF",
                name=f"Mean {measure}",
                hovertext=[
                    f"Testcase: {tc}<br>MLD: {mld:.4f}<br>MAX_ABS_DIFF:"
                    f" {abs_diff:.4f}<br>Format:"
                    f" {format}<br>Date: {date.date()}"
                    for tc, mld, abs_diff, format, date in zip(
                        data_diff["testcase"],
                        data_diff["MLD"],
                        data_diff["MAX_ABS_DIFF"],
                        data_diff["format"],
                        data_diff["date"],
                    f"Mean {measure}: {value:.4f}" f"<br>Date: {date.date()}"
                    for value, date in zip(
                        data["mean"],
                        data["date"],
                    )
                ],
                hoverinfo="text",
                marker_color="blue",
                showlegend=(i == 0),
            ),
            row=row,
            col=col,
            secondary_y=True,
        )

    fig.update_layout(
        title_text="Long-term regression: max MLD and max MAX_ABS_DIFF",
        title_text=f"History: {measure}",
        legend=dict(x=1, y=1, orientation="v"),
        hovermode="x unified",
    )
@@ -144,21 +179,8 @@ def plot_data(df, output_filename):
    fig.update_xaxes(automargin=True)
    fig.update_yaxes(automargin=True)

    # Update y-axes titles per subplot
    for i in range(10):
        yaxis_num = i * 2 + 1
        yaxis2_num = yaxis_num + 1
        fig["layout"][f"yaxis{yaxis_num}"].update(
            title="Max MLD", titlefont=dict(color="blue"), tickfont=dict(color="blue")
        )
        fig["layout"][f"yaxis{yaxis2_num}"].update(
            title="Max MAX_ABS_DIFF",
            titlefont=dict(color="green"),
            tickfont=dict(color="green"),
        )

    # Save to html
    fig.write_html(output_filename)
    fig.write_html(args.output_filename)


if __name__ == "__main__":
@@ -173,8 +195,54 @@ if __name__ == "__main__":
        type=str,
        help="Filename of the generated plot. e.g" ". long_term_regression.html",
    )
    parser.add_argument(
        "--days",
        type=int,
        help="Number of days in history. Default: 30",
        default=30,
    )
    parser.add_argument(
        "--measure",
        type=str,
        help="Measure for analysis: MLD, MAX_ABS_DIFF, MIN_ODG, MIN_SSNR, default: MLD",
        default="MLD",
    )
    parser.add_argument(
        "--include",
        nargs="+",
        type=str,
        help="List of tags to include in testcases",
    )
    parser.add_argument(
        "--reject",
        nargs="+",
        type=str,
        help="List of tags to reject in testcases",
    )
    parser.add_argument(
        "--match",
        type=str,
        help="Regex pattern for selecting testcases",
    )
    parser.add_argument(
        "--job-include",
        nargs="+",
        type=str,
        help="List of tags to include in jobs",
    )
    parser.add_argument(
        "--job-reject",
        nargs="+",
        type=str,
        help="List of tags to reject in jobs",
    )
    parser.add_argument(
        "--job-match",
        type=str,
        help="Regex pattern for selecting jobs",
    )
    args = parser.parse_args()

    csv_data = read_csv_files(args.root_dir)
    data = parse_csv_data(csv_data)
    plot_data(data, args.output_filename)
    plot_data(data, args)
+194 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3

import argparse
from pathlib import Path
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots


def main(args):
    """Plot, per audio format, the worst day-over-day ratio of a regression
    measure (e.g. MLD) across all jobs/testcases found in ``logs_dir``, write
    the plot to ``output_filename`` (HTML), and optionally dump the underlying
    worst-case rows to a semicolon-separated CSV (``--csv``).

    Expects ``logs_dir`` to contain one sub-directory per date, each holding
    CSV logs named like ``*-<job parts>-...-<sha>.csv`` (job is rebuilt from
    filename parts [3:-4]; the commit sha is the last dash-separated part).
    """

    logs_dir = args.logs_dir
    output_filename = args.output_filename
    measure = args.measure
    days = args.days

    input_path = Path(logs_dir)
    # One sub-directory per date; the directory name itself is the date key.
    logs = [f for f in input_path.iterdir() if f.is_dir()]

    # Build nested dicts keyed date -> job -> testcase:
    #   logdict   holds the measure value
    #   formatdict holds the audio format of each testcase
    #   sha       maps date -> commit sha (taken from the CSV filename)
    formatdict = {}
    sha = {}
    logdict = {}
    for log in logs:
        date = log.name
        logdict[date] = {}
        formatdict[date] = {}
        for logfile in log.glob("*.csv"):
            # Filename layout assumption: dash-separated, job name is
            # parts[3:-4], sha is the stem of the last part — TODO confirm
            # against the CI artifact naming convention.
            tmp = logfile.name.split("-")
            job = "-".join(tmp[3:-4])
            sha[date] = tmp[-1].split(".")[0]
            data = pd.read_csv(logfile, usecols=["testcase", measure, "format"])
            logdict[date][job] = {}
            formatdict[date][job] = {}

            # NOTE(review): "format" shadows the builtin here; harmless in
            # this tight loop but worth renaming eventually.
            for testcase, value, format in zip(
                data["testcase"], data[measure], data["format"]
            ):
                formatdict[date][job][testcase] = format
                logdict[date][job][testcase] = value

    # Flatten the nested dicts into rows, then pivot so each date becomes a
    # column: one row per (job, testcase, format).
    csv_rows = []
    formats = []
    for date, jobs in logdict.items():
        for job, testcases in jobs.items():
            for testcase, value in testcases.items():
                csv_rows.append(
                    (job, testcase, formatdict[date][job][testcase], date, value)
                )

    result = pd.DataFrame(
        csv_rows, columns=["job", "testcase", "format", "date", "value"]
    )
    result = result.pivot(
        index=["job", "testcase", "format"], columns="date", values="value"
    ).reset_index()

    # Keep only tests for which results exist in any of the days
    # (columns[0:3] are job/testcase/format; date columns start at index 3).
    if days == -1:
        rng = result.columns[3:]  # Whole history
    else:
        rng = result.columns[-days:]
    result = result.dropna(subset=rng)
    result = result.reset_index(drop=True)

    ratio = result.copy()
    ratio = ratio.reset_index()
    dates = result.iloc[:, 3:].columns

    # Day-over-day ratio per row: value(day N) / value(day N-1).
    ratio[dates[0]] = 1.0  # Set first ratio to 1.0
    for prevdate, currdate in zip(dates[0:-1], dates[1:]):
        ratio[currdate] = result[currdate] / result[prevdate]

    values = result.iloc[:, 3:]
    date = values.columns

    formats = result["format"].dropna().unique().tolist()

    # NOTE(review): with the default days == -1 this slice is dates[1:-1] and
    # the range(days) loop below runs zero times, so the plot is empty unless
    # --days is given explicitly — confirm intended behavior.
    plotdata = pd.DataFrame(0.0, index=formats, columns=dates[-(days) : -1])
    plottext = pd.DataFrame("", index=formats, columns=dates[-(days) : -1])

    all_indices = []

    for i in range(days):
        currdate = dates[-(days - i)]  # Make robust for shorter history
        # NOTE(review): prevdate is assigned but never used in this loop; at
        # i == 0 the index -(days + 1) would also overrun a history of exactly
        # `days` entries — review before relying on it.
        prevdate = dates[-(days - i + 1)]

        # Row index of the worst (largest) ratio per format for this date.
        idx = ratio.groupby("format")[currdate].idxmax()
        all_indices.extend(idx.tolist())

        # Store worst case per format for plotting
        for f in formats:
            plotdata.loc[f, currdate] = ratio.iloc[idx[f]][currdate]
            plottext.loc[f, currdate] = (
                f"Job: {result.iloc[idx[f]]['job']}<br>Testcase: {result.iloc[idx[f]]['testcase']} <br>Max {measure} ratio: {ratio.iloc[idx[f]][currdate]:.2f}<br>Date: {currdate}"
            )

    # One subplot per format on a fixed 5x2 grid, x-axes shared per column.
    fig = make_subplots(
        rows=5,
        cols=2,
        subplot_titles=[f"{i}" for i in formats],
        shared_xaxes="columns",
    )

    for i, fmt in enumerate(formats):
        row = i // 2 + 1
        col = i % 2 + 1

        fig.add_trace(
            go.Scatter(
                x=pd.to_datetime(plotdata.columns),
                y=plotdata.loc[fmt],
                mode="lines+markers",
                name=f"Max {measure}",
                hovertext=plottext.loc[fmt],
                hoverinfo="text",
                showlegend=False,
            ),
            row=row,
            col=col,
        )

    fig.update_layout(
        title_text=f"Regression detection: Max {measure} ratio",
        legend=dict(x=1, y=1, orientation="v"),
        hovermode="x unified",
    )

    fig.update_xaxes(automargin=True)
    fig.update_yaxes(automargin=True)

    # Save to html
    fig.write_html(output_filename)

    # Optional CSV summary of the worst-case rows: each row gets its current
    # value, its historical minimum (with date and sha), and the diff/ratio
    # against that minimum, sorted worst-first within each format.
    if args.csv:
        output = result.iloc[all_indices].copy()
        cols = ["job","testcase","format"]
        cols.extend(date[-days:].tolist())
        output = output.loc[:,cols]
        values = output.iloc[:, 3:]
        last_date = values.columns[-1]
        output.insert(3, "min_date", values.idxmin(axis=1))
        output.insert(4, "min_sha", output["min_date"].map(sha))
        output.insert(5, "curr_value", output[last_date])
        output.insert(6, "min_value", values.min(axis=1))
        output.insert(7, "diff", output["curr_value"] - output["min_value"])
        output.insert(8, "ratio", output["curr_value"] / output["min_value"])
        output.loc[output["min_value"] == 0, "ratio"] = (
            1  # Set ratio to 1 for denominator 0
        )
        output["min_sha"] = (
            "'" + output["min_sha"]
        )  # Prepend an apostrophe so Excel treats the sha as text, not a number
        output.sort_values(
            by=["format", "ratio"], ascending=[True, False], inplace=True
        )
        output.to_csv(args.csv, sep=";", index=False)


if __name__ == "__main__":
    # CLI entry point: two positional paths plus optional measure/days/csv
    # flags, registered from a declarative spec list.
    cli = argparse.ArgumentParser(description="logs dir")

    argument_specs = [
        ("logs_dir", {"type": str, "help": "Logs dir, e.g. logs"}),
        ("output_filename", {"type": str, "help": "Output html file. e.g mld.html"}),
        (
            "--measure",
            {
                "type": str,
                "help": "Measure for summary, one of MLD MIN_SSNR MAX_ABS_DIFF MIN_ODG, (default: MLD)",
                "default": "MLD",
            },
        ),
        (
            "--days",
            {
                "type": int,
                "help": "Number of days in history, (default: whole history)",
                "default": -1,
            },
        ),
        ("--csv", {"type": str, "help": "CSV output file"}),
    ]
    for flag, options in argument_specs:
        cli.add_argument(flag, **options)

    args = cli.parse_args()
    main(args)