diff --git a/ci/process_long_term_logs.py b/ci/process_long_term_logs.py
index baabc7d93a1c28bf0804b9a939a6968810675fb7..1eec7fff35d0c9810b1f1d5f332c04f3b364900e 100644
--- a/ci/process_long_term_logs.py
+++ b/ci/process_long_term_logs.py
@@ -1,7 +1,9 @@
+#!/usr/bin/env python3
+
import os
import pandas as pd
import argparse
-import plotly.express as px
+import re
import plotly.graph_objects as go
from plotly.subplots import make_subplots
@@ -23,15 +25,18 @@ def read_csv_files(root_dir):
def parse_csv_data(csv_data):
- """keep 'testcase', 'format', 'MLD', 'MAX_ABS_DIFF' and add
- 'date' column."""
- cols_to_keep = ["testcase", "format", "MLD", "MAX_ABS_DIFF"]
+ """keep 'testcase', 'format', 'MLD', 'MAX_ABS_DIFF', 'MIN_ODG', 'MIN_SSNR' and add
+ 'date' and 'job' column."""
+ cols_to_keep = ["testcase", "format", "MLD", "MAX_ABS_DIFF", "MIN_ODG", "MIN_SSNR"]
parsed_data = {}
for key, df in csv_data.items():
+ tmp = key.split("-")
+ job = "-".join(tmp[4:-4])
cols = [col for col in cols_to_keep if col in df.columns]
date = os.path.basename(os.path.dirname(key))
new_df = df[cols].copy()
new_df["date"] = date
+ new_df["job"] = job
parsed_data[key] = new_df
# concatenate all dataframe in the dictionary
@@ -39,38 +44,67 @@ def parse_csv_data(csv_data):
return concat_df
-def plot_data(df, output_filename):
- """plot max values for 'MLD' and 'MAX_ABS_DIFF' data and save
+def plot_data(df, args):
+ """plot max values for measure and data and save
to html file."""
+
+ measure = args.measure
+ days = args.days
+
# Convert 'date' to datetime
df["date"] = pd.to_datetime(df["date"], errors="coerce")
- df["MLD"] = pd.to_numeric(df["MLD"], errors="coerce")
- df["MAX_ABS_DIFF"] = pd.to_numeric(df["MAX_ABS_DIFF"], errors="coerce")
+ df[measure] = pd.to_numeric(df[measure], errors="coerce")
+
+ # Filter out rows older than "days"
+ cutoff = df["date"].max() - pd.Timedelta(days=days)
+ df = df[df["date"] > cutoff].reset_index(drop=True)
# Drop rows with NaT and NaN
- clean_df = df.dropna(subset=["date", "MLD", "MAX_ABS_DIFF"])
+ df = df.dropna(subset=["date", measure])
+
+ # Filter test cases based on include/reject/match arguments
+ if args.include:
+ mask = pd.Series(False, index=df.index)
+ for tag in args.include:
+ mask |= df["testcase"].str.contains(tag, case=False, na=False)
+ df = df[mask]
+ if args.reject:
+ mask = pd.Series(False, index=df.index)
+ for tag in args.reject:
+ mask |= df["testcase"].str.contains(tag, case=False, na=False)
+ df = df[~mask]
+ if args.match:
+ pattern = re.compile(args.match, re.IGNORECASE)
+ df = df[df["testcase"].str.contains(pattern, na=False)]
+
+ # Filter jobs based on job-include/job-reject/job-match arguments
+ if args.job_include:
+ mask = pd.Series(False, index=df.index)
+ for tag in args.job_include:
+ mask |= df["job"].str.contains(tag, case=False, na=False)
+ df = df[mask]
+ if args.job_reject:
+ mask = pd.Series(False, index=df.index)
+ for tag in args.job_reject:
+ mask |= df["job"].str.contains(tag, case=False, na=False)
+ df = df[~mask]
+ if args.job_match:
+ pattern = re.compile(args.job_match, re.IGNORECASE)
+ df = df[df["job"].str.contains(pattern, na=False)]
- # Group by 'format' and 'date' to get rows with max 'MLD' per group
- max_mld = (
- clean_df.groupby(["format", "date"])
- .apply(lambda x: x.loc[x["MLD"].idxmax()])
- .reset_index(drop=True)
- )
- # Group by 'format' and 'date' to get rows with max 'MAX_ABS_DIFF' per
- # group
- max_abs_diff = (
- clean_df.groupby(["format", "date"])
- .apply(lambda x: x.loc[x["MAX_ABS_DIFF"].idxmax()])
- .reset_index(drop=True)
- )
+    # Group by 'format' and 'date' to get the max, min and mean of the measure per group
+ idx = df.groupby(["format", "date"])[measure].idxmax()
+ max = df.loc[idx].reset_index(drop=True)
+ idx = df.groupby(["format", "date"])[measure].idxmin()
+ min = df.loc[idx].reset_index(drop=True)
+ mean = df.groupby(["format", "date"])[measure].mean().to_frame("mean").reset_index()
- formats = sorted(clean_df["format"].unique())
+ formats = sorted(df["format"].unique())
fig = make_subplots(
rows=5,
cols=2,
- specs=[[{"secondary_y": True}] * 2] * 5,
subplot_titles=[f"{i}" for i in formats],
shared_xaxes="columns",
)
@@ -79,64 +113,65 @@ def plot_data(df, output_filename):
row = i // 2 + 1
col = i % 2 + 1
- data_mld = max_mld[max_mld["format"] == fmt].sort_values("date")
- data_diff = max_abs_diff[max_abs_diff["format"]
- == fmt].sort_values("date")
+ if "MIN" in measure:
+ data = min[min["format"] == fmt].sort_values("date")
+ maxmin_str = "Min"
+ else:
+ data = max[max["format"] == fmt].sort_values("date")
+ maxmin_str = "Max"
- # Add max 'MLD' to primary y-axis
+ # Add max measure to plots
fig.add_trace(
go.Scatter(
- x=data_mld["date"],
- y=data_mld["MLD"],
+ x=data["date"],
+ y=data[measure],
mode="lines+markers",
- name=f" {fmt} - Max MLD",
+ name=f"{maxmin_str} {measure}",
hovertext=[
-                    f"Testcase: {tc}<br>MLD: {mld:.4f}<br>MAX_ABS_DIFF:"
-                    f"{abs_diff}<br>Format:"
-                    f" {format}<br>Date: {date.date()}"
- for tc, mld, abs_diff, format, date in zip(
- data_mld["testcase"],
- data_mld["MLD"],
- data_mld["MAX_ABS_DIFF"],
- data_mld["format"],
- data_mld["date"],
+                    f"Testcase: {tc}<br>{maxmin_str} {measure}: {value:.4f}"
+                    f"<br>Job: {job}"
+                    f"<br>Date: {date.date()}"
+ for job, tc, value, date in zip(
+ data["job"],
+ data["testcase"],
+ data[measure],
+ data["date"],
)
],
hoverinfo="text",
+ marker_color="red",
+ showlegend=(i == 0),
),
row=row,
col=col,
- secondary_y=False,
)
- # Add max 'MAX_ABS_DIFF' to secondary y-axis
+ data = mean[mean["format"] == fmt].sort_values("date")
+
+ # Add mean measure to plots
fig.add_trace(
go.Scatter(
- x=data_diff["date"],
- y=data_diff["MAX_ABS_DIFF"],
+ x=data["date"],
+ y=data["mean"],
mode="lines+markers",
- name=f"{fmt} - Max MAX_ABS_DIFF",
+ name=f"Mean {measure}",
hovertext=[
-                    f"Testcase: {tc}<br>MLD: {mld:.4f}<br>MAX_ABS_DIFF:"
-                    f" {abs_diff:.4f}<br>Format:"
-                    f" {format}<br>Date: {date.date()}"
- for tc, mld, abs_diff, format, date in zip(
- data_diff["testcase"],
- data_diff["MLD"],
- data_diff["MAX_ABS_DIFF"],
- data_diff["format"],
- data_diff["date"],
+                    f"Mean {measure}: {value:.4f}" f"<br>Date: {date.date()}"
+ for value, date in zip(
+ data["mean"],
+ data["date"],
)
],
hoverinfo="text",
+ marker_color="blue",
+ showlegend=(i == 0),
),
row=row,
col=col,
- secondary_y=True,
)
fig.update_layout(
- title_text="Long-term regression: max MLD and max MAX_ABS_DIFF",
+ title_text=f"History: {measure}",
legend=dict(x=1, y=1, orientation="v"),
hovermode="x unified",
)
@@ -144,21 +179,8 @@ def plot_data(df, output_filename):
fig.update_xaxes(automargin=True)
fig.update_yaxes(automargin=True)
- # Update y-axes titles per subplot
- for i in range(10):
- yaxis_num = i * 2 + 1
- yaxis2_num = yaxis_num + 1
- fig["layout"][f"yaxis{yaxis_num}"].update(
- title="Max MLD", titlefont=dict(color="blue"), tickfont=dict(color="blue")
- )
- fig["layout"][f"yaxis{yaxis2_num}"].update(
- title="Max MAX_ABS_DIFF",
- titlefont=dict(color="green"),
- tickfont=dict(color="green"),
- )
-
# Save to html
- fig.write_html(output_filename)
+ fig.write_html(args.output_filename)
if __name__ == "__main__":
@@ -173,8 +195,54 @@ if __name__ == "__main__":
type=str,
help="Filename of the generated plot. e.g" ". long_term_regression.html",
)
+ parser.add_argument(
+ "--days",
+ type=int,
+ help="Number of days in history. Default: 30",
+ default=30,
+ )
+ parser.add_argument(
+ "--measure",
+ type=str,
+ help="Measure for analysis: MLD, MAX_ABS_DIFF, MIN_ODG, MIN_SSNR, default: MLD",
+ default="MLD",
+ )
+ parser.add_argument(
+ "--include",
+ nargs="+",
+ type=str,
+ help="List of tags to include in testcases",
+ )
+ parser.add_argument(
+ "--reject",
+ nargs="+",
+ type=str,
+ help="List of tags to reject in testcases",
+ )
+ parser.add_argument(
+ "--match",
+ type=str,
+ help="Regex pattern for selecting testcases",
+ )
+ parser.add_argument(
+ "--job-include",
+ nargs="+",
+ type=str,
+ help="List of tags to include in jobs",
+ )
+ parser.add_argument(
+ "--job-reject",
+ nargs="+",
+ type=str,
+ help="List of tags to reject in jobs",
+ )
+ parser.add_argument(
+ "--job-match",
+ type=str,
+ help="Regex pattern for selecting jobs",
+ )
args = parser.parse_args()
csv_data = read_csv_files(args.root_dir)
data = parse_csv_data(csv_data)
- plot_data(data, args.output_filename)
+ plot_data(data, args)
diff --git a/scripts/detect_regressions.py b/scripts/detect_regressions.py
new file mode 100644
index 0000000000000000000000000000000000000000..4aa98941f630f814aab5bf87b055b1194b766e62
--- /dev/null
+++ b/scripts/detect_regressions.py
@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+
+import argparse
+from pathlib import Path
+import pandas as pd
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+
+
+def main(args):
+
+ logs_dir = args.logs_dir
+ output_filename = args.output_filename
+ measure = args.measure
+ days = args.days
+
+ input_path = Path(logs_dir)
+ logs = [f for f in input_path.iterdir() if f.is_dir()]
+
+ # Build dict of scores
+ formatdict = {}
+ sha = {}
+ logdict = {}
+ for log in logs:
+ date = log.name
+ logdict[date] = {}
+ formatdict[date] = {}
+ for logfile in log.glob("*.csv"):
+ tmp = logfile.name.split("-")
+ job = "-".join(tmp[3:-4])
+ sha[date] = tmp[-1].split(".")[0]
+ data = pd.read_csv(logfile, usecols=["testcase", measure, "format"])
+ logdict[date][job] = {}
+ formatdict[date][job] = {}
+
+ for testcase, value, format in zip(
+ data["testcase"], data[measure], data["format"]
+ ):
+ formatdict[date][job][testcase] = format
+ logdict[date][job][testcase] = value
+
+ # Restructure dict
+ csv_rows = []
+ formats = []
+ for date, jobs in logdict.items():
+ for job, testcases in jobs.items():
+ for testcase, value in testcases.items():
+ csv_rows.append(
+ (job, testcase, formatdict[date][job][testcase], date, value)
+ )
+
+ result = pd.DataFrame(
+ csv_rows, columns=["job", "testcase", "format", "date", "value"]
+ )
+ result = result.pivot(
+ index=["job", "testcase", "format"], columns="date", values="value"
+ ).reset_index()
+
+ # Keep only tests for which results exist in any of the days
+ if days == -1:
+ rng = result.columns[3:] # Whole history
+ else:
+ rng = result.columns[-days:]
+ result = result.dropna(subset=rng)
+ result = result.reset_index(drop=True)
+
+ ratio = result.copy()
+ ratio = ratio.reset_index()
+ dates = result.iloc[:, 3:].columns
+
+ # Calculate ratios
+ ratio[dates[0]] = 1.0 # Set first ratio to 1.0
+ for prevdate, currdate in zip(dates[0:-1], dates[1:]):
+ ratio[currdate] = result[currdate] / result[prevdate]
+
+ values = result.iloc[:, 3:]
+ date = values.columns
+
+ formats = result["format"].dropna().unique().tolist()
+
+ plotdata = pd.DataFrame(0.0, index=formats, columns=dates[-(days) : -1])
+ plottext = pd.DataFrame("", index=formats, columns=dates[-(days) : -1])
+
+ all_indices = []
+
+ for i in range(days):
+ currdate = dates[-(days - i)] # Make robust for shorter history
+ prevdate = dates[-(days - i + 1)]
+
+ idx = ratio.groupby("format")[currdate].idxmax()
+ all_indices.extend(idx.tolist())
+
+ # Store worst case per format for plotting
+ for f in formats:
+ plotdata.loc[f, currdate] = ratio.iloc[idx[f]][currdate]
+ plottext.loc[f, currdate] = (
+                f"Job: {result.iloc[idx[f]]['job']}<br>"
+                f"Testcase: {result.iloc[idx[f]]['testcase']}<br>"
+                f"Max {measure} ratio: {ratio.iloc[idx[f]][currdate]:.2f}<br>"
+                f"Date: {currdate}"
+ )
+
+ fig = make_subplots(
+ rows=5,
+ cols=2,
+ subplot_titles=[f"{i}" for i in formats],
+ shared_xaxes="columns",
+ )
+
+ for i, fmt in enumerate(formats):
+ row = i // 2 + 1
+ col = i % 2 + 1
+
+ fig.add_trace(
+ go.Scatter(
+ x=pd.to_datetime(plotdata.columns),
+ y=plotdata.loc[fmt],
+ mode="lines+markers",
+ name=f"Max {measure}",
+ hovertext=plottext.loc[fmt],
+ hoverinfo="text",
+ showlegend=False,
+ ),
+ row=row,
+ col=col,
+ )
+
+ fig.update_layout(
+ title_text=f"Regression detection: Max {measure} ratio",
+ legend=dict(x=1, y=1, orientation="v"),
+ hovermode="x unified",
+ )
+
+ fig.update_xaxes(automargin=True)
+ fig.update_yaxes(automargin=True)
+
+ # Save to html
+ fig.write_html(output_filename)
+
+ # Write CSV-file
+ if args.csv:
+ output = result.iloc[all_indices].copy()
+ cols = ["job","testcase","format"]
+ cols.extend(date[-days:].tolist())
+ output = output.loc[:,cols]
+ values = output.iloc[:, 3:]
+ last_date = values.columns[-1]
+ output.insert(3, "min_date", values.idxmin(axis=1))
+ output.insert(4, "min_sha", output["min_date"].map(sha))
+ output.insert(5, "curr_value", output[last_date])
+ output.insert(6, "min_value", values.min(axis=1))
+ output.insert(7, "diff", output["curr_value"] - output["min_value"])
+ output.insert(8, "ratio", output["curr_value"] / output["min_value"])
+ output.loc[output["min_value"] == 0, "ratio"] = (
+ 1 # Set ratio to 1 for denominator 0
+ )
+ output["min_sha"] = (
+ "'" + output["min_sha"]
+            )  # Add apostrophe to prevent Excel reading this as a number
+ output.sort_values(
+ by=["format", "ratio"], ascending=[True, False], inplace=True
+ )
+ output.to_csv(args.csv, sep=";", index=False)
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="logs dir")
+ parser.add_argument(
+ "logs_dir",
+ type=str,
+ help="Logs dir, e.g. logs",
+ )
+ parser.add_argument(
+ "output_filename",
+ type=str,
+ help="Output html file. e.g mld.html",
+ )
+ parser.add_argument(
+ "--measure",
+ type=str,
+ help="Measure for summary, one of MLD MIN_SSNR MAX_ABS_DIFF MIN_ODG, (default: MLD)",
+ default="MLD",
+ )
+ parser.add_argument(
+ "--days",
+ type=int,
+ help="Number of days in history, (default: whole history)",
+ default=-1,
+ )
+ parser.add_argument(
+ "--csv",
+ type=str,
+ help="CSV output file",
+ )
+
+ args = parser.parse_args()
+ main(args)