#!/usr/bin/env python3
"""Detect per-format regressions from dated log directories and plot them.

Scans ``logs_dir`` for one sub-directory per date, each holding CSV log files
whose names embed a job name and a commit sha.  For every consecutive pair of
dates it writes a ``regressions_<measure>_<date>.csv`` report with the 10
largest day-over-day ratios per format, and renders an HTML overview plot of
the worst-case ratio per format over time.
"""
import argparse
from pathlib import Path

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots


def main(logs_dir, output_filename, measure, days, all_results,
         diff_thr, ratio_thr, curr_value_thr):
    """Build per-date regression CSV reports and an HTML overview plot.

    Parameters
    ----------
    logs_dir : str
        Directory with one sub-directory per date containing ``*.csv`` logs.
    output_filename : str
        Path the HTML plot is written to.
    measure : str
        Log column to analyse (e.g. ``MLD``).
    days : int
        Number of most recent day-pairs to report; a negative value means the
        whole available history.
    all_results, diff_thr, ratio_thr, curr_value_thr
        Accepted for CLI compatibility; TODO(review): threshold filtering of
        the CSV reports is not implemented yet.
    """
    input_path = Path(logs_dir)
    logs = [entry for entry in input_path.iterdir() if entry.is_dir()]

    # Build nested dicts: {date: {job: {testcase: value / format}}}.
    formatdict = {}
    # One commit sha per date; the last CSV parsed wins — assumes all files of
    # a date share the same sha (TODO confirm against the log producer).
    sha = {}
    logdict = {}
    for log in logs:
        date = log.name
        logdict[date] = {}
        formatdict[date] = {}
        for logfile in log.glob("*.csv"):
            # Filename layout (presumed): <x>-<x>-<x>-<job...>-<...>-<sha>.csv
            parts = logfile.name.split("-")
            job = "-".join(parts[3:-4])
            sha[date] = parts[-1].split(".")[0]
            data = pd.read_csv(logfile, usecols=["testcase", measure, "format"])
            logdict[date][job] = {}
            formatdict[date][job] = {}
            for testcase, value, fmt in zip(
                data["testcase"], data[measure], data["format"]
            ):
                formatdict[date][job][testcase] = fmt
                logdict[date][job][testcase] = value

    # Restructure the dicts into long-form rows, then pivot to wide form with
    # one column per date.
    csv_rows = []
    format_rows = []
    for date, jobs in logdict.items():
        for job, testcases in jobs.items():
            for testcase, value in testcases.items():
                csv_rows.append((job, testcase, date, value))
                format_rows.append(
                    (job, testcase, date, formatdict[date][job][testcase])
                )
    result = pd.DataFrame(csv_rows, columns=["job", "testcase", "date", "value"])
    result = result.pivot(
        index=["job", "testcase"], columns="date", values="value"
    ).reset_index()
    format_df = pd.DataFrame(
        format_rows, columns=["job", "testcase", "date", "format"]
    )
    format_df = format_df.pivot(
        index=["job", "testcase"], columns="date", values="format"
    ).reset_index()

    ratio = result.copy()
    dates = result.iloc[:, 2:].columns

    # Day-over-day ratios; the first date has no predecessor, define it as 1.
    ratio[dates[0]] = 1
    for prevdate, currdate in zip(dates[0:-1], dates[1:]):
        ratio[currdate] = result[currdate] / result[prevdate]

    last_date = dates[-1]
    result.insert(2, "format", format_df[last_date])
    ratio.insert(2, "format", format_df[last_date])

    formats = result["format"].dropna().unique().tolist()

    # Normalise `days`: negative means whole history, and never request more
    # day-pairs than the history provides.  (Fixes: the original default of -1
    # skipped the loop entirely and produced an empty report, and days larger
    # than the history raised IndexError.)
    max_days = max(len(dates) - 2, 0)
    if days < 0 or days > max_days:
        days = max_days

    plotdata = pd.DataFrame(0.0, index=formats, columns=dates[-(days + 1):-1])
    plottext = pd.DataFrame("", index=formats, columns=dates[-(days + 1):-1])
    for i in range(days):
        # NOTE(review): the most recent date (dates[-1]) is excluded — the
        # loop ends at dates[-2], matching the plot columns above.  Confirm
        # this is intentional (e.g. last day's logs incomplete).
        currdate = dates[-(days - i + 1)]
        prevdate = dates[-(days - i + 2)]
        # Positions of the 10 largest ratios per format (level 1 of the
        # groupby MultiIndex is the original RangeIndex label == position).
        idx = (
            ratio.groupby("format")[currdate]
            .nlargest(10)
            .index.get_level_values(1)
        )
        tmp = (
            result[["job", "testcase", "format", prevdate, currdate]]
            .iloc[idx, :]
            .copy()
            .reset_index()
        )
        tmp.insert(3, "prev_date", prevdate)
        tmp.insert(4, "prev_sha", sha[prevdate])
        tmp.insert(5, "curr_date", currdate)
        # Fixed: was sha[prevdate], which labelled the current column with the
        # previous day's commit sha.
        tmp.insert(6, "curr_sha", sha[currdate])
        tmp.insert(7, "diff", tmp[currdate] - tmp[prevdate])
        tmp.insert(8, "ratio", tmp[currdate] / tmp[prevdate])
        tmp.loc[tmp[prevdate] == 0, "ratio"] = (
            1  # Set ratio to 1 for denominator 0
        )
        # Add apostrophe to prevent Excel reading the shas as numbers.
        tmp["prev_sha"] = "'" + tmp["prev_sha"]
        tmp["curr_sha"] = "'" + tmp["curr_sha"]
        csv_filename = f"regressions_{measure}_{currdate}.csv"
        tmp.to_csv(csv_filename, sep=";", index=False)

        # Store the worst case per format for plotting.
        worst = tmp.groupby("format")["ratio"].idxmax()
        for fmt in formats:
            plotdata.loc[fmt, currdate] = tmp.iloc[worst[fmt]]["ratio"]
            plottext.loc[fmt, currdate] = (
                f"{tmp.iloc[worst[fmt]]['job']} - "
                f"{tmp.iloc[worst[fmt]]['testcase']} - "
                f"Max {measure} ratio: {tmp.iloc[worst[fmt]]['ratio']:.2f}"
            )

    # Grid sized to the number of formats (the original hard-coded 5x2 and
    # broke for more than 10 formats).
    n_rows = max((len(formats) + 1) // 2, 1)
    fig = make_subplots(
        rows=n_rows,
        cols=2,
        subplot_titles=[f"{fmt}" for fmt in formats],
        shared_xaxes="columns",
    )
    for i, fmt in enumerate(formats):
        row = i // 2 + 1
        col = i % 2 + 1
        fig.add_trace(
            go.Scatter(
                x=pd.to_datetime(plotdata.columns),
                y=plotdata.loc[fmt],
                mode="lines+markers",
                name=f"Max {measure}",
                hovertext=plottext.loc[fmt],
                hoverinfo="text",
                showlegend=False,
            ),
            row=row,
            col=col,
        )
    fig.update_layout(
        title_text=f"Regression detection: Max {measure} ratio",
        legend=dict(x=1, y=1, orientation="v"),
        hovermode="x unified",
    )
    fig.update_xaxes(automargin=True)
    fig.update_yaxes(automargin=True)

    # Save to html
    fig.write_html(output_filename)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="logs dir")
    parser.add_argument(
        "logs_dir",
        type=str,
        help="Logs dir, e.g. logs",
    )
    parser.add_argument(
        "output_filename",
        type=str,
        # Fixed help text: the script writes an HTML plot, not a CSV file.
        help="Filename of the output HTML plot, e.g. regressions.html",
    )
    parser.add_argument(
        "--measure",
        type=str,
        help="Measure for summary, one of MLD MIN_SSNR MAX_ABS_DIFF MIN_ODG, (default: MLD)",
        default="MLD",
    )
    parser.add_argument(
        "--days",
        type=int,
        help="Number of days in history, (default: whole history)",
        default=-1,
    )
    parser.add_argument(
        "--all_results",
        action="store_true",
        help="Output all results, including cases without regression (default: off)",
        default=False,
    )
    parser.add_argument(
        "--diff_thr",
        type=float,
        help="Include test cases with diff above diff_thr, (default: 0.0)",
        default=0.0,
    )
    parser.add_argument(
        "--ratio_thr",
        type=float,
        help="Include test cases with ratio above ratio_thr, (default: 1.0)",
        default=1.0,
    )
    parser.add_argument(
        "--curr_value_thr",
        type=float,
        help="Include test cases with curr_value above curr_value_thr, (default: 0.0)",
        default=0.0,
    )
    args = parser.parse_args()
    main(args.logs_dir, args.output_filename, args.measure, args.days,
         args.all_results, args.diff_thr, args.ratio_thr, args.curr_value_thr)
#!/usr/bin/env python3
"""Detect per-format regressions from dated log directories and plot them.

Scans ``logs_dir`` for one sub-directory per date, each holding CSV log files
whose names embed a job name and a commit sha.  For every consecutive pair of
dates it writes a ``regressions_<measure>_<date>.csv`` report with the 10
largest day-over-day ratios per format, and renders an HTML overview plot of
the worst-case ratio per format over time.
"""
import argparse
from pathlib import Path

import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots


def main(logs_dir, output_filename, measure, days, all_results,
         diff_thr, ratio_thr, curr_value_thr):
    """Build per-date regression CSV reports and an HTML overview plot.

    Parameters
    ----------
    logs_dir : str
        Directory with one sub-directory per date containing ``*.csv`` logs.
    output_filename : str
        Path the HTML plot is written to.
    measure : str
        Log column to analyse (e.g. ``MLD``).
    days : int
        Number of most recent day-pairs to report; a negative value means the
        whole available history.
    all_results, diff_thr, ratio_thr, curr_value_thr
        Accepted for CLI compatibility; TODO(review): threshold filtering of
        the CSV reports is not implemented yet.
    """
    input_path = Path(logs_dir)
    logs = [entry for entry in input_path.iterdir() if entry.is_dir()]

    # Build nested dicts: {date: {job: {testcase: value / format}}}.
    formatdict = {}
    # One commit sha per date; the last CSV parsed wins — assumes all files of
    # a date share the same sha (TODO confirm against the log producer).
    sha = {}
    logdict = {}
    for log in logs:
        date = log.name
        logdict[date] = {}
        formatdict[date] = {}
        for logfile in log.glob("*.csv"):
            # Filename layout (presumed): <x>-<x>-<x>-<job...>-<...>-<sha>.csv
            parts = logfile.name.split("-")
            job = "-".join(parts[3:-4])
            sha[date] = parts[-1].split(".")[0]
            data = pd.read_csv(logfile, usecols=["testcase", measure, "format"])
            logdict[date][job] = {}
            formatdict[date][job] = {}
            for testcase, value, fmt in zip(
                data["testcase"], data[measure], data["format"]
            ):
                formatdict[date][job][testcase] = fmt
                logdict[date][job][testcase] = value

    # Restructure the dicts into long-form rows, then pivot to wide form with
    # one column per date.
    csv_rows = []
    format_rows = []
    for date, jobs in logdict.items():
        for job, testcases in jobs.items():
            for testcase, value in testcases.items():
                csv_rows.append((job, testcase, date, value))
                format_rows.append(
                    (job, testcase, date, formatdict[date][job][testcase])
                )
    result = pd.DataFrame(csv_rows, columns=["job", "testcase", "date", "value"])
    result = result.pivot(
        index=["job", "testcase"], columns="date", values="value"
    ).reset_index()
    format_df = pd.DataFrame(
        format_rows, columns=["job", "testcase", "date", "format"]
    )
    format_df = format_df.pivot(
        index=["job", "testcase"], columns="date", values="format"
    ).reset_index()

    ratio = result.copy()
    dates = result.iloc[:, 2:].columns

    # Day-over-day ratios; the first date has no predecessor, define it as 1.
    ratio[dates[0]] = 1
    for prevdate, currdate in zip(dates[0:-1], dates[1:]):
        ratio[currdate] = result[currdate] / result[prevdate]

    last_date = dates[-1]
    result.insert(2, "format", format_df[last_date])
    ratio.insert(2, "format", format_df[last_date])

    formats = result["format"].dropna().unique().tolist()

    # Normalise `days`: negative means whole history, and never request more
    # day-pairs than the history provides.  (Fixes: the original default of -1
    # skipped the loop entirely and produced an empty report, and days larger
    # than the history raised IndexError.)
    max_days = max(len(dates) - 2, 0)
    if days < 0 or days > max_days:
        days = max_days

    plotdata = pd.DataFrame(0.0, index=formats, columns=dates[-(days + 1):-1])
    plottext = pd.DataFrame("", index=formats, columns=dates[-(days + 1):-1])
    for i in range(days):
        # NOTE(review): the most recent date (dates[-1]) is excluded — the
        # loop ends at dates[-2], matching the plot columns above.  Confirm
        # this is intentional (e.g. last day's logs incomplete).
        currdate = dates[-(days - i + 1)]
        prevdate = dates[-(days - i + 2)]
        # Positions of the 10 largest ratios per format (level 1 of the
        # groupby MultiIndex is the original RangeIndex label == position).
        idx = (
            ratio.groupby("format")[currdate]
            .nlargest(10)
            .index.get_level_values(1)
        )
        tmp = (
            result[["job", "testcase", "format", prevdate, currdate]]
            .iloc[idx, :]
            .copy()
            .reset_index()
        )
        tmp.insert(3, "prev_date", prevdate)
        tmp.insert(4, "prev_sha", sha[prevdate])
        tmp.insert(5, "curr_date", currdate)
        # Fixed: was sha[prevdate], which labelled the current column with the
        # previous day's commit sha.
        tmp.insert(6, "curr_sha", sha[currdate])
        tmp.insert(7, "diff", tmp[currdate] - tmp[prevdate])
        tmp.insert(8, "ratio", tmp[currdate] / tmp[prevdate])
        tmp.loc[tmp[prevdate] == 0, "ratio"] = (
            1  # Set ratio to 1 for denominator 0
        )
        # Add apostrophe to prevent Excel reading the shas as numbers.
        tmp["prev_sha"] = "'" + tmp["prev_sha"]
        tmp["curr_sha"] = "'" + tmp["curr_sha"]
        csv_filename = f"regressions_{measure}_{currdate}.csv"
        tmp.to_csv(csv_filename, sep=";", index=False)

        # Store the worst case per format for plotting.
        worst = tmp.groupby("format")["ratio"].idxmax()
        for fmt in formats:
            plotdata.loc[fmt, currdate] = tmp.iloc[worst[fmt]]["ratio"]
            plottext.loc[fmt, currdate] = (
                f"{tmp.iloc[worst[fmt]]['job']} - "
                f"{tmp.iloc[worst[fmt]]['testcase']} - "
                f"Max {measure} ratio: {tmp.iloc[worst[fmt]]['ratio']:.2f}"
            )

    # Grid sized to the number of formats (the original hard-coded 5x2 and
    # broke for more than 10 formats).
    n_rows = max((len(formats) + 1) // 2, 1)
    fig = make_subplots(
        rows=n_rows,
        cols=2,
        subplot_titles=[f"{fmt}" for fmt in formats],
        shared_xaxes="columns",
    )
    for i, fmt in enumerate(formats):
        row = i // 2 + 1
        col = i % 2 + 1
        fig.add_trace(
            go.Scatter(
                x=pd.to_datetime(plotdata.columns),
                y=plotdata.loc[fmt],
                mode="lines+markers",
                name=f"Max {measure}",
                hovertext=plottext.loc[fmt],
                hoverinfo="text",
                showlegend=False,
            ),
            row=row,
            col=col,
        )
    fig.update_layout(
        title_text=f"Regression detection: Max {measure} ratio",
        legend=dict(x=1, y=1, orientation="v"),
        hovermode="x unified",
    )
    fig.update_xaxes(automargin=True)
    fig.update_yaxes(automargin=True)

    # Save to html
    fig.write_html(output_filename)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="logs dir")
    parser.add_argument(
        "logs_dir",
        type=str,
        help="Logs dir, e.g. logs",
    )
    parser.add_argument(
        "output_filename",
        type=str,
        # Fixed help text: the script writes an HTML plot, not a CSV file.
        help="Filename of the output HTML plot, e.g. regressions.html",
    )
    parser.add_argument(
        "--measure",
        type=str,
        help="Measure for summary, one of MLD MIN_SSNR MAX_ABS_DIFF MIN_ODG, (default: MLD)",
        default="MLD",
    )
    parser.add_argument(
        "--days",
        type=int,
        help="Number of days in history, (default: whole history)",
        default=-1,
    )
    parser.add_argument(
        "--all_results",
        action="store_true",
        help="Output all results, including cases without regression (default: off)",
        default=False,
    )
    parser.add_argument(
        "--diff_thr",
        type=float,
        help="Include test cases with diff above diff_thr, (default: 0.0)",
        default=0.0,
    )
    parser.add_argument(
        "--ratio_thr",
        type=float,
        help="Include test cases with ratio above ratio_thr, (default: 1.0)",
        default=1.0,
    )
    parser.add_argument(
        "--curr_value_thr",
        type=float,
        help="Include test cases with curr_value above curr_value_thr, (default: 0.0)",
        default=0.0,
    )
    args = parser.parse_args()
    main(args.logs_dir, args.output_filename, args.measure, args.days,
         args.all_results, args.diff_thr, args.ratio_thr, args.curr_value_thr)