Add scripts/find_regressions_from_logs2.py as alternative analysis script (4f6b4a0f) · Commits · IVAS Codec Public Collaboration / IVAS Codec

scripts/find_regressions_from_logs2.py

0 → 100644

+189 −0

Original line number	Diff line number	Diff line
		#!/usr/bin/env python3

		import argparse
		from pathlib import Path
		import pandas as pd
		import plotly.graph_objects as go
		from plotly.subplots import make_subplots

		def main(logs_dir, output_filename, measure, days, all_results, diff_thr, ratio_thr, curr_value_thr):

		input_path = Path(logs_dir)
		logs = [f for f in input_path.iterdir() if f.is_dir()]

		# Build dict of scores
		formatdict = {}
		sha = {}
		logdict = {}
		for log in logs:
		date = log.name
		logdict[date] = {}
		formatdict[date] = {}
		for logfile in log.glob("*.csv"):
		tmp = logfile.name.split("-")
		job = "-".join(tmp[3:-4])
		sha[date] = tmp[-1].split(".")[0]
		data = pd.read_csv(logfile, usecols=["testcase", measure, "format"])
		logdict[date][job] = {}
		formatdict[date][job] = {}

		for testcase, value, format in zip(
		data["testcase"], data[measure], data["format"]
		):
		formatdict[date][job][testcase] = format
		logdict[date][job][testcase] = value

		# Restructure dict
		csv_rows = []
		formats = []
		for date, jobs in logdict.items():
		for job, testcases in jobs.items():
		for testcase, value in testcases.items():
		csv_rows.append((job, testcase, date, value))
		formats.append((job, testcase, date, formatdict[date][job][testcase]))

		result = pd.DataFrame(csv_rows, columns=["job", "testcase", "date", "value"])
		result = result.pivot(
		index=["job", "testcase"], columns="date", values="value"
		).reset_index()

		f = pd.DataFrame(formats, columns=["job", "testcase", "date", "format"])
		f = f.pivot(
		index=["job", "testcase"], columns="date", values="format"
		).reset_index()

		ratio = result.copy()
		dates = result.iloc[:, 2:].columns

		# Calculate ratios
		ratio[dates[0]] = 1 # Set first ratio to 1
		for prevdate, currdate in zip( dates[0:-1], dates[1:]):
		ratio[currdate] = result[currdate] / result[prevdate]

		values = result.iloc[:, 2:]
		date = values.columns
		last_date = date[-1]
		result.insert(2, "format", f[last_date])
		ratio.insert(2, "format", f[last_date])

		formats = result['format'].dropna().unique().tolist()

		plotdata = pd.DataFrame(0.0, index=formats, columns=dates[-(days+1):-1])
		plottext = pd.DataFrame("", index=formats, columns=dates[-(days+1):-1])

		for i in range(days):
		currdate = dates[-(days-i+1)] # Make robust for shorter history
		prevdate = dates[-(days-i+2)]
		idx = ratio.groupby("format")[currdate].nlargest(10).index.get_level_values(1)
		tmp = result[["job","testcase","format",prevdate,currdate]].iloc[idx,:].copy().reset_index()
		tmp.insert(3, "prev_date", prevdate)
		tmp.insert(4, "prev_sha", sha[prevdate])
		tmp.insert(5, "curr_date", currdate)
		tmp.insert(6, "curr_sha", sha[prevdate])
		tmp.insert(7, "diff", tmp[currdate] - tmp[prevdate])
		tmp.insert(8, "ratio", tmp[currdate] / tmp[prevdate])
		tmp.loc[tmp[prevdate] == 0, "ratio"] = (
		1 # Set ratio to 1 for denominator 0
		)
		tmp["prev_sha"] = "'" + tmp["prev_sha"] # Add apostrophy to prevent Excel reading this as a number
		tmp["curr_sha"] = "'" + tmp["curr_sha"] # Add apostrophy to prevent Excel reading this as a number

		csv_filename = f"regressions_{measure}_{currdate}.csv"
		tmp.to_csv(csv_filename, sep=";", index=False)

		# Store worst case per format for plotting
		idx = tmp.groupby("format")["ratio"].idxmax()
		for f in formats:
		plotdata.loc[f, currdate] = tmp.iloc[idx[f]]["ratio"]
		plottext.loc[f, currdate] = f"{tmp.iloc[idx[f]]['job']} - {tmp.iloc[idx[f]]['testcase']} - Max {measure} ratio: {tmp.iloc[idx[f]]['ratio']:.2f}"

		fig = make_subplots(
		rows=5,
		cols=2,
		subplot_titles=[f"{i}" for i in formats],
		shared_xaxes="columns",
		)

		for i, fmt in enumerate(formats):
		row = i // 2 + 1
		col = i % 2 + 1

		fig.add_trace(
		go.Scatter(
		x=pd.to_datetime(plotdata.columns),
		y=plotdata.loc[fmt],
		mode="lines+markers",
		name=f"Max {measure}",
		hovertext=plottext.loc[fmt],
		hoverinfo="text",
		showlegend=False,
		),
		row=row,
		col=col,
		)

		fig.update_layout(
		title_text=f"Regression detection: Max {measure} ratio",
		legend=dict(x=1, y=1, orientation="v"),
		hovermode="x unified",
		)

		fig.update_xaxes(automargin=True)
		fig.update_yaxes(automargin=True)

		# Save to html
		fig.write_html(output_filename)

		if __name__ == "__main__":
		parser = argparse.ArgumentParser(description="logs dir")
		parser.add_argument(
		"logs_dir",
		type=str,
		help="Logs dir, e.g. logs",
		)
		parser.add_argument(
		"output_filename",
		type=str,
		help="Filename of the combined csv file. e.g mld.csv",
		)
		parser.add_argument(
		"--measure",
		type=str,
		help="Measure for summary, one of MLD MIN_SSNR MAX_ABS_DIFF MIN_ODG, (default: MLD)",
		default="MLD",
		)
		parser.add_argument(
		"--days",
		type=int,
		help="Number of days in history, (default: whole history)",
		default=-1,
		)
		parser.add_argument(
		"--all_results",
		action="store_true",
		help="Output all results, including cases without regression (default: off)",
		default=False,
		)
		parser.add_argument(
		"--diff_thr",
		type=float,
		help="Include test cases with diff above diff_thr, (default: 0.0)",
		default=0.0,
		)
		parser.add_argument(
		"--ratio_thr",
		type=float,
		help="Include test cases with ratio above ratio_thr, (default: 1.0)",
		default=1.0,
		)
		parser.add_argument(
		"--curr_value_thr",
		type=float,
		help="Include test cases with curr_value above curr_value_thr, (default: 0.0)",
		default=0.0,
		)

		args = parser.parse_args()

		main(args.logs_dir, args.output_filename, args.measure, args.days, args.all_results, args.diff_thr, args.ratio_thr, args.curr_value_thr)