diff --git a/ci/process_long_term_logs.py b/ci/process_long_term_logs.py index baabc7d93a1c28bf0804b9a939a6968810675fb7..1eec7fff35d0c9810b1f1d5f332c04f3b364900e 100644 --- a/ci/process_long_term_logs.py +++ b/ci/process_long_term_logs.py @@ -1,7 +1,9 @@ +#!/usr/bin/env python3 + import os import pandas as pd import argparse -import plotly.express as px +import re import plotly.graph_objects as go from plotly.subplots import make_subplots @@ -23,15 +25,18 @@ def read_csv_files(root_dir): def parse_csv_data(csv_data): - """keep 'testcase', 'format', 'MLD', 'MAX_ABS_DIFF' and add - 'date' column.""" - cols_to_keep = ["testcase", "format", "MLD", "MAX_ABS_DIFF"] + """keep 'testcase', 'format', 'MLD', 'MAX_ABS_DIFF', 'MIN_ODG', 'MIN_SSNR' and add + 'date' and 'job' column.""" + cols_to_keep = ["testcase", "format", "MLD", "MAX_ABS_DIFF", "MIN_ODG", "MIN_SSNR"] parsed_data = {} for key, df in csv_data.items(): + tmp = key.split("-") + job = "-".join(tmp[4:-4]) cols = [col for col in cols_to_keep if col in df.columns] date = os.path.basename(os.path.dirname(key)) new_df = df[cols].copy() new_df["date"] = date + new_df["job"] = job parsed_data[key] = new_df # concatenate all dataframe in the dictionary @@ -39,38 +44,67 @@ def parse_csv_data(csv_data): return concat_df -def plot_data(df, output_filename): - """plot max values for 'MLD' and 'MAX_ABS_DIFF' data and save +def plot_data(df, args): + """plot max values for measure and data and save to html file.""" + + measure = args.measure + days = args.days + # Convert 'date' to datetime df["date"] = pd.to_datetime(df["date"], errors="coerce") - df["MLD"] = pd.to_numeric(df["MLD"], errors="coerce") - df["MAX_ABS_DIFF"] = pd.to_numeric(df["MAX_ABS_DIFF"], errors="coerce") + df[measure] = pd.to_numeric(df[measure], errors="coerce") + + # Filter out rows older than "days" + cutoff = df["date"].max() - pd.Timedelta(days=days) + df = df[df["date"] > cutoff].reset_index(drop=True) # Drop rows with NaT and NaN - clean_df = df.dropna(subset=["date", "MLD", "MAX_ABS_DIFF"]) + df = df.dropna(subset=["date", measure]) + + # Filter test cases based on include/reject/match arguments + if args.include: + mask = pd.Series(False, index=df.index) + for tag in args.include: + mask |= df["testcase"].str.contains(tag, case=False, na=False) + df = df[mask] + if args.reject: + mask = pd.Series(False, index=df.index) + for tag in args.reject: + mask |= df["testcase"].str.contains(tag, case=False, na=False) + df = df[~mask] + if args.match: + pattern = re.compile(args.match, re.IGNORECASE) + df = df[df["testcase"].str.contains(pattern, na=False)] + + # Filter jobs based on job-include/job-reject/job-match arguments + if args.job_include: + mask = pd.Series(False, index=df.index) + for tag in args.job_include: + mask |= df["job"].str.contains(tag, case=False, na=False) + df = df[mask] + if args.job_reject: + mask = pd.Series(False, index=df.index) + for tag in args.job_reject: + mask |= df["job"].str.contains(tag, case=False, na=False) + df = df[~mask] + if args.job_match: + pattern = re.compile(args.job_match, re.IGNORECASE) + df = df[df["job"].str.contains(pattern, na=False)] - # Group by 'format' and 'date' to get rows with max 'MLD' per group - max_mld = ( - clean_df.groupby(["format", "date"]) - .apply(lambda x: x.loc[x["MLD"].idxmax()]) - .reset_index(drop=True) - ) - # Group by 'format' and 'date' to get rows with max 'MAX_ABS_DIFF' per - # group - max_abs_diff = ( - clean_df.groupby(["format", "date"]) - .apply(lambda x: x.loc[x["MAX_ABS_DIFF"].idxmax()]) - .reset_index(drop=True) - ) + # Group by 'format' and 'date' to get rows with max 'MLD' per group + idx = df.groupby(["format", "date"])[measure].idxmax() + max = df.loc[idx].reset_index(drop=True) + idx = df.groupby(["format", "date"])[measure].idxmin() + min = df.loc[idx].reset_index(drop=True) + mean = df.groupby(["format", "date"])[measure].mean().to_frame("mean").reset_index() - formats = sorted(clean_df["format"].unique()) + formats = sorted(df["format"].unique()) fig = make_subplots( rows=5, cols=2, - specs=[[{"secondary_y": True}] * 2] * 5, subplot_titles=[f"{i}" for i in formats], shared_xaxes="columns", ) @@ -79,64 +113,65 @@ def plot_data(df, output_filename): row = i // 2 + 1 col = i % 2 + 1 - data_mld = max_mld[max_mld["format"] == fmt].sort_values("date") - data_diff = max_abs_diff[max_abs_diff["format"] - == fmt].sort_values("date") + if "MIN" in measure: + data = min[min["format"] == fmt].sort_values("date") + maxmin_str = "Min" + else: + data = max[max["format"] == fmt].sort_values("date") + maxmin_str = "Max" - # Add max 'MLD' to primary y-axis + # Add max measure to plots fig.add_trace( go.Scatter( - x=data_mld["date"], - y=data_mld["MLD"], + x=data["date"], + y=data[measure], mode="lines+markers", - name=f" {fmt} - Max MLD", + name=f"{maxmin_str} {measure}", hovertext=[ - f"Testcase: {tc}
MLD: {mld:.4f}
MAX_ABS_DIFF:" - f"{abs_diff}
Format:" - f" {format}
Date: {date.date()}" - for tc, mld, abs_diff, format, date in zip( - data_mld["testcase"], - data_mld["MLD"], - data_mld["MAX_ABS_DIFF"], - data_mld["format"], - data_mld["date"], + f"Testcase: {tc}
{maxmin_str} {measure}: {value:.4f}" + f"
Job: {job}" + f"
Date: {date.date()}" + for job, tc, value, date in zip( + data["job"], + data["testcase"], + data[measure], + data["date"], ) ], hoverinfo="text", + marker_color="red", + showlegend=(i == 0), ), row=row, col=col, - secondary_y=False, ) - # Add max 'MAX_ABS_DIFF' to secondary y-axis + data = mean[mean["format"] == fmt].sort_values("date") + + # Add mean measure to plots fig.add_trace( go.Scatter( - x=data_diff["date"], - y=data_diff["MAX_ABS_DIFF"], + x=data["date"], + y=data["mean"], mode="lines+markers", - name=f"{fmt} - Max MAX_ABS_DIFF", + name=f"Mean {measure}", hovertext=[ - f"Testcase: {tc}
MLD: {mld:.4f}
MAX_ABS_DIFF:" - f" {abs_diff:.4f}
Format:" - f" {format}
Date: {date.date()}" - for tc, mld, abs_diff, format, date in zip( - data_diff["testcase"], - data_diff["MLD"], - data_diff["MAX_ABS_DIFF"], - data_diff["format"], - data_diff["date"], + f"Mean {measure}: {value:.4f}" f"
Date: {date.date()}" + for value, date in zip( + data["mean"], + data["date"], ) ], hoverinfo="text", + marker_color="blue", + showlegend=(i == 0), ), row=row, col=col, - secondary_y=True, ) fig.update_layout( - title_text="Long-term regression: max MLD and max MAX_ABS_DIFF", + title_text=f"History: {measure}", legend=dict(x=1, y=1, orientation="v"), hovermode="x unified", ) @@ -144,21 +179,8 @@ def plot_data(df, output_filename): fig.update_xaxes(automargin=True) fig.update_yaxes(automargin=True) - # Update y-axes titles per subplot - for i in range(10): - yaxis_num = i * 2 + 1 - yaxis2_num = yaxis_num + 1 - fig["layout"][f"yaxis{yaxis_num}"].update( - title="Max MLD", titlefont=dict(color="blue"), tickfont=dict(color="blue") - ) - fig["layout"][f"yaxis{yaxis2_num}"].update( - title="Max MAX_ABS_DIFF", - titlefont=dict(color="green"), - tickfont=dict(color="green"), - ) - # Save to html - fig.write_html(output_filename) + fig.write_html(args.output_filename) if __name__ == "__main__": @@ -173,8 +195,54 @@ if __name__ == "__main__": type=str, help="Filename of the generated plot. e.g" ". long_term_regression.html", ) + parser.add_argument( + "--days", + type=int, + help="Number of days in history. Default: 30", + default=30, + ) + parser.add_argument( + "--measure", + type=str, + help="Measure for analysis: MLD, MAX_ABS_DIFF, MIN_ODG, MIN_SSNR, default: MLD", + default="MLD", + ) + parser.add_argument( + "--include", + nargs="+", + type=str, + help="List of tags to include in testcases", + ) + parser.add_argument( + "--reject", + nargs="+", + type=str, + help="List of tags to reject in testcases", + ) + parser.add_argument( + "--match", + type=str, + help="Regex pattern for selecting testcases", + ) + parser.add_argument( + "--job-include", + nargs="+", + type=str, + help="List of tags to include in jobs", + ) + parser.add_argument( + "--job-reject", + nargs="+", + type=str, + help="List of tags to reject in jobs", + ) + parser.add_argument( + "--job-match", + type=str, + help="Regex pattern for selecting jobs", + ) args = parser.parse_args() csv_data = read_csv_files(args.root_dir) data = parse_csv_data(csv_data) - plot_data(data, args.output_filename) + plot_data(data, args) diff --git a/scripts/detect_regressions.py b/scripts/detect_regressions.py new file mode 100644 index 0000000000000000000000000000000000000000..4aa98941f630f814aab5bf87b055b1194b766e62 --- /dev/null +++ b/scripts/detect_regressions.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 + +import argparse +from pathlib import Path +import pandas as pd +import plotly.graph_objects as go +from plotly.subplots import make_subplots + + +def main(args): + + logs_dir = args.logs_dir + output_filename = args.output_filename + measure = args.measure + days = args.days + + input_path = Path(logs_dir) + logs = [f for f in input_path.iterdir() if f.is_dir()] + + # Build dict of scores + formatdict = {} + sha = {} + logdict = {} + for log in logs: + date = log.name + logdict[date] = {} + formatdict[date] = {} + for logfile in log.glob("*.csv"): + tmp = logfile.name.split("-") + job = "-".join(tmp[3:-4]) + sha[date] = tmp[-1].split(".")[0] + data = pd.read_csv(logfile, usecols=["testcase", measure, "format"]) + logdict[date][job] = {} + formatdict[date][job] = {} + + for testcase, value, format in zip( + data["testcase"], data[measure], data["format"] + ): + formatdict[date][job][testcase] = format + logdict[date][job][testcase] = value + + # Restructure dict + csv_rows = [] + formats = [] + for date, jobs in logdict.items(): + for job, testcases in jobs.items(): + for testcase, value in testcases.items(): + csv_rows.append( + (job, testcase, formatdict[date][job][testcase], date, value) + ) + + result = pd.DataFrame( + csv_rows, columns=["job", "testcase", "format", "date", "value"] + ) + result = result.pivot( + index=["job", "testcase", "format"], columns="date", values="value" + ).reset_index() + + # Keep only tests for which results exist in any of the days + if days == -1: + rng = result.columns[3:] # Whole history + else: + rng = result.columns[-days:] + result = result.dropna(subset=rng) + result = result.reset_index(drop=True) + + ratio = result.copy() + ratio = ratio.reset_index() + dates = result.iloc[:, 3:].columns + + # Calculate ratios + ratio[dates[0]] = 1.0 # Set first ratio to 1.0 + for prevdate, currdate in zip(dates[0:-1], dates[1:]): + ratio[currdate] = result[currdate] / result[prevdate] + + values = result.iloc[:, 3:] + date = values.columns + + formats = result["format"].dropna().unique().tolist() + + plotdata = pd.DataFrame(0.0, index=formats, columns=dates[-(days) : -1]) + plottext = pd.DataFrame("", index=formats, columns=dates[-(days) : -1]) + + all_indices = [] + + for i in range(days): + currdate = dates[-(days - i)] # Make robust for shorter history + prevdate = dates[-(days - i + 1)] + + idx = ratio.groupby("format")[currdate].idxmax() + all_indices.extend(idx.tolist()) + + # Store worst case per format for plotting + for f in formats: + plotdata.loc[f, currdate] = ratio.iloc[idx[f]][currdate] + plottext.loc[f, currdate] = ( + f"Job: {result.iloc[idx[f]]['job']}
Testcase: {result.iloc[idx[f]]['testcase']}
Max {measure} ratio: {ratio.iloc[idx[f]][currdate]:.2f}
Date: {currdate}" + ) + + fig = make_subplots( + rows=5, + cols=2, + subplot_titles=[f"{i}" for i in formats], + shared_xaxes="columns", + ) + + for i, fmt in enumerate(formats): + row = i // 2 + 1 + col = i % 2 + 1 + + fig.add_trace( + go.Scatter( + x=pd.to_datetime(plotdata.columns), + y=plotdata.loc[fmt], + mode="lines+markers", + name=f"Max {measure}", + hovertext=plottext.loc[fmt], + hoverinfo="text", + showlegend=False, + ), + row=row, + col=col, + ) + + fig.update_layout( + title_text=f"Regression detection: Max {measure} ratio", + legend=dict(x=1, y=1, orientation="v"), + hovermode="x unified", + ) + + fig.update_xaxes(automargin=True) + fig.update_yaxes(automargin=True) + + # Save to html + fig.write_html(output_filename) + + # Write CSV-file + if args.csv: + output = result.iloc[all_indices].copy() + cols = ["job","testcase","format"] + cols.extend(date[-days:].tolist()) + output = output.loc[:,cols] + values = output.iloc[:, 3:] + last_date = values.columns[-1] + output.insert(3, "min_date", values.idxmin(axis=1)) + output.insert(4, "min_sha", output["min_date"].map(sha)) + output.insert(5, "curr_value", output[last_date]) + output.insert(6, "min_value", values.min(axis=1)) + output.insert(7, "diff", output["curr_value"] - output["min_value"]) + output.insert(8, "ratio", output["curr_value"] / output["min_value"]) + output.loc[output["min_value"] == 0, "ratio"] = ( + 1 # Set ratio to 1 for denominator 0 + ) + output["min_sha"] = ( + "'" + output["min_sha"] + ) # Add apostrophy to prevent Excel reading this as a number + output.sort_values( + by=["format", "ratio"], ascending=[True, False], inplace=True + ) + output.to_csv(args.csv, sep=";", index=False) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="logs dir") + parser.add_argument( + "logs_dir", + type=str, + help="Logs dir, e.g. logs", + ) + parser.add_argument( + "output_filename", + type=str, + help="Output html file. e.g mld.html", + ) + parser.add_argument( + "--measure", + type=str, + help="Measure for summary, one of MLD MIN_SSNR MAX_ABS_DIFF MIN_ODG, (default: MLD)", + default="MLD", + ) + parser.add_argument( + "--days", + type=int, + help="Number of days in history, (default: whole history)", + default=-1, + ) + parser.add_argument( + "--csv", + type=str, + help="CSV output file", + ) + + args = parser.parse_args() + main(args)