From 52f27a7d64ddd6fc6a1eba295c212ab217cb6ec9 Mon Sep 17 00:00:00 2001 From: Charles Kinuthia Date: Mon, 28 Jul 2025 10:30:38 +0200 Subject: [PATCH 1/2] add script for processing long-term logs Usage: - python process_long_term_logs.py logs long_term_regression.html --- ci/process_long_term_logs.py | 147 +++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 ci/process_long_term_logs.py diff --git a/ci/process_long_term_logs.py b/ci/process_long_term_logs.py new file mode 100644 index 0000000000..9a221ee7ce --- /dev/null +++ b/ci/process_long_term_logs.py @@ -0,0 +1,147 @@ +import os +import pandas as pd +import argparse +import plotly.express as px +import plotly.graph_objects as go +from plotly.subplots import make_subplots + + +def read_csv_files(root_dir): + """ Read csv files as dictionary of panda dataframes.""" + csv_data = {} + for subdir, dirs, files in os.walk(root_dir): + for file in files: + if file.endswith('.csv'): + file_path = os.path.join(subdir, file) + try: + df = pd.read_csv(file_path) + csv_data[file_path] = df + except Exception as e: + print(f'Failed to read {file_path}: {e}') + exit(-1) + return csv_data + + +def parse_csv_data(csv_data): + """ keep 'testcase', 'format', 'MLD', 'MAX_ABS_DIFF' and add + 'date' column.""" + cols_to_keep = ["testcase", "format", "MLD", "MAX_ABS_DIFF"] + parsed_data = {} + for key, df in csv_data.items(): + cols = [ + col for col in cols_to_keep if col in df.columns] + date = os.path.basename(os.path.dirname(key)) + new_df = df[cols].copy() + new_df['date'] = date + parsed_data[key] = new_df + + # concatenate all dataframe in the dictionary + concat_df = pd.concat(parsed_data.values(), ignore_index=True) + return concat_df + + +def plot_data(df, output_filename): + """ plot max values for 'MLD' and 'MAX_ABS_DIFF' data and save + to html file. """ + # Convert 'date' to datetime + df['date'] = pd.to_datetime(df['date'], errors='coerce') + df['MLD'] = pd.to_numeric(df['MLD'], errors='coerce') + df['MAX_ABS_DIFF'] = pd.to_numeric(df['MAX_ABS_DIFF'], errors='coerce') + + # Drop rows with NaT and NaN + clean_df = df.dropna(subset=['date', 'MLD', 'MAX_ABS_DIFF']) + + # Group by 'format' and 'date' to get rows with max 'MLD' per group + max_mld = clean_df.groupby(['format', 'date']).apply( + lambda x: x.loc[x['MLD'].idxmax()]).reset_index(drop=True) + + # Group by 'format' and 'date' to get rows with max 'MAX_ABS_DIFF' per + # group + max_diff = clean_df.groupby(['format', 'date']).apply( + lambda x: x.loc[x['MAX_ABS_DIFF'].idxmax()]).reset_index(drop=True) + + formats = sorted(clean_df['format'].unique()) + + fig = make_subplots( + rows=5, cols=2, + specs=[[{"secondary_y": True}]*2]*5, + subplot_titles=[f'{i}' for i in formats], + shared_xaxes='columns' + ) + + for i, fmt in enumerate(formats): + row = i // 2 + 1 + col = i % 2 + 1 + + data_mld = max_mld[max_mld['format'] == fmt].sort_values('date') + data_diff = max_diff[max_diff['format'] == fmt].sort_values('date') + + # Add max 'MLD' to primary y-axis + fig.add_trace( + go.Scatter( + x=data_mld['date'], y=data_mld['MLD'], mode='lines+markers', + name=f' {fmt} - Max MLD', + hovertext=[f"Testcase: {tc}
<br>MAX_ABS_DIFF: {diff}<br>Format:"
+                           f" {format}<br>
Date: {date.date()}" + for tc, diff, format, date in zip( + data_mld['testcase'], data_mld['MAX_ABS_DIFF'], + data_mld['format'], data_mld['date'])], + hoverinfo='text+y' + ), + row=row, col=col, secondary_y=False + ) + + # Add max 'MAX_ABS_DIFF' to secondary y-axis + fig.add_trace( + go.Scatter( + x=data_diff['date'], y=data_diff['MAX_ABS_DIFF'], + mode='lines+markers', + name=f'{fmt} - Max MAX_ABS_DIFF', + hovertext=[f"Testcase: {tc}
<br>MLD: {mld:.4f}<br>Format:"
+                           f" {format}<br>
Date: {date.date()}" + for tc, mld, format, date in zip( + data_diff['testcase'], data_diff['MLD'], + data_mld['format'], data_diff['date'])], + hoverinfo='text+y' + ), + row=row, col=col, secondary_y=True + ) + + fig.update_layout( + title_text='Long-term regression: max MLD and max MAX_ABS_DIFF', + legend=dict(x=1, y=1, orientation='v'), + hovermode='x unified' + ) + + fig.update_xaxes(automargin=True) + fig.update_yaxes(automargin=True) + + # Update y-axes titles per subplot + for i in range(10): + yaxis_num = i*2 + 1 + yaxis2_num = yaxis_num + 1 + fig['layout'][f'yaxis{yaxis_num}'].update( + title='Max MLD', titlefont=dict(color='blue'), + tickfont=dict(color='blue')) + fig['layout'][f'yaxis{yaxis2_num}'].update( + title='Max MAX_ABS_DIFF', titlefont=dict(color='green'), + tickfont=dict(color='green')) + + # Save to html + fig.write_html(output_filename) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='Plot long term logs') + parser.add_argument( + 'root_dir', type=str, help='Root directory containing subdirectories' + ' with CSV log files') + parser.add_argument( + 'output_filename', type=str, help='Filename of the generated plot. e.g' + '. long_term_regression.html') + args = parser.parse_args() + + csv_data = read_csv_files(args.root_dir) + data = parse_csv_data(csv_data) + plot_data(data, args.output_filename) -- GitLab From f43dfa50e47e8f879746873c39260263b81c5b9b Mon Sep 17 00:00:00 2001 From: Charles Kinuthia Date: Tue, 5 Aug 2025 08:37:26 +0200 Subject: [PATCH 2/2] format code and add label to hover data --- ci/process_long_term_logs.py | 155 +++++++++++++++++++++-------------- 1 file changed, 94 insertions(+), 61 deletions(-) diff --git a/ci/process_long_term_logs.py b/ci/process_long_term_logs.py index 9a221ee7ce..baabc7d93a 100644 --- a/ci/process_long_term_logs.py +++ b/ci/process_long_term_logs.py @@ -7,32 +7,31 @@ from plotly.subplots import make_subplots def read_csv_files(root_dir): - """ Read csv files as dictionary of panda dataframes.""" + """Read csv files as dictionary of panda dataframes.""" csv_data = {} for subdir, dirs, files in os.walk(root_dir): for file in files: - if file.endswith('.csv'): + if file.endswith(".csv"): file_path = os.path.join(subdir, file) try: df = pd.read_csv(file_path) csv_data[file_path] = df except Exception as e: - print(f'Failed to read {file_path}: {e}') + print(f"Failed to read {file_path}: {e}") exit(-1) return csv_data def parse_csv_data(csv_data): - """ keep 'testcase', 'format', 'MLD', 'MAX_ABS_DIFF' and add - 'date' column.""" + """keep 'testcase', 'format', 'MLD', 'MAX_ABS_DIFF' and add + 'date' column.""" cols_to_keep = ["testcase", "format", "MLD", "MAX_ABS_DIFF"] parsed_data = {} for key, df in csv_data.items(): - cols = [ - col for col in cols_to_keep if col in df.columns] + cols = [col for col in cols_to_keep if col in df.columns] date = os.path.basename(os.path.dirname(key)) new_df = df[cols].copy() - new_df['date'] = date + new_df["date"] = date parsed_data[key] = new_df # concatenate all dataframe in the dictionary @@ -41,76 +40,105 @@ def parse_csv_data(csv_data): def plot_data(df, output_filename): - """ plot max values for 'MLD' and 'MAX_ABS_DIFF' data and save - to html file. 
""" + """plot max values for 'MLD' and 'MAX_ABS_DIFF' data and save + to html file.""" # Convert 'date' to datetime - df['date'] = pd.to_datetime(df['date'], errors='coerce') - df['MLD'] = pd.to_numeric(df['MLD'], errors='coerce') - df['MAX_ABS_DIFF'] = pd.to_numeric(df['MAX_ABS_DIFF'], errors='coerce') + df["date"] = pd.to_datetime(df["date"], errors="coerce") + df["MLD"] = pd.to_numeric(df["MLD"], errors="coerce") + df["MAX_ABS_DIFF"] = pd.to_numeric(df["MAX_ABS_DIFF"], errors="coerce") # Drop rows with NaT and NaN - clean_df = df.dropna(subset=['date', 'MLD', 'MAX_ABS_DIFF']) + clean_df = df.dropna(subset=["date", "MLD", "MAX_ABS_DIFF"]) # Group by 'format' and 'date' to get rows with max 'MLD' per group - max_mld = clean_df.groupby(['format', 'date']).apply( - lambda x: x.loc[x['MLD'].idxmax()]).reset_index(drop=True) + max_mld = ( + clean_df.groupby(["format", "date"]) + .apply(lambda x: x.loc[x["MLD"].idxmax()]) + .reset_index(drop=True) + ) # Group by 'format' and 'date' to get rows with max 'MAX_ABS_DIFF' per # group - max_diff = clean_df.groupby(['format', 'date']).apply( - lambda x: x.loc[x['MAX_ABS_DIFF'].idxmax()]).reset_index(drop=True) + max_abs_diff = ( + clean_df.groupby(["format", "date"]) + .apply(lambda x: x.loc[x["MAX_ABS_DIFF"].idxmax()]) + .reset_index(drop=True) + ) - formats = sorted(clean_df['format'].unique()) + formats = sorted(clean_df["format"].unique()) fig = make_subplots( - rows=5, cols=2, - specs=[[{"secondary_y": True}]*2]*5, - subplot_titles=[f'{i}' for i in formats], - shared_xaxes='columns' + rows=5, + cols=2, + specs=[[{"secondary_y": True}] * 2] * 5, + subplot_titles=[f"{i}" for i in formats], + shared_xaxes="columns", ) for i, fmt in enumerate(formats): row = i // 2 + 1 col = i % 2 + 1 - data_mld = max_mld[max_mld['format'] == fmt].sort_values('date') - data_diff = max_diff[max_diff['format'] == fmt].sort_values('date') + data_mld = max_mld[max_mld["format"] == fmt].sort_values("date") + data_diff = max_abs_diff[max_abs_diff["format"] + == fmt].sort_values("date") # Add max 'MLD' to primary y-axis fig.add_trace( go.Scatter( - x=data_mld['date'], y=data_mld['MLD'], mode='lines+markers', - name=f' {fmt} - Max MLD', - hovertext=[f"Testcase: {tc}
<br>MAX_ABS_DIFF: {diff}<br>Format:"
-                           f" {format}<br>
Date: {date.date()}" - for tc, diff, format, date in zip( - data_mld['testcase'], data_mld['MAX_ABS_DIFF'], - data_mld['format'], data_mld['date'])], - hoverinfo='text+y' + x=data_mld["date"], + y=data_mld["MLD"], + mode="lines+markers", + name=f" {fmt} - Max MLD", + hovertext=[ + f"Testcase: {tc}
<br>MLD: {mld:.4f}<br>MAX_ABS_DIFF:"
+                    f"{abs_diff}<br>Format:"
+                    f" {format}<br>
Date: {date.date()}" + for tc, mld, abs_diff, format, date in zip( + data_mld["testcase"], + data_mld["MLD"], + data_mld["MAX_ABS_DIFF"], + data_mld["format"], + data_mld["date"], + ) + ], + hoverinfo="text", ), - row=row, col=col, secondary_y=False + row=row, + col=col, + secondary_y=False, ) # Add max 'MAX_ABS_DIFF' to secondary y-axis fig.add_trace( go.Scatter( - x=data_diff['date'], y=data_diff['MAX_ABS_DIFF'], - mode='lines+markers', - name=f'{fmt} - Max MAX_ABS_DIFF', - hovertext=[f"Testcase: {tc}
<br>MLD: {mld:.4f}<br>Format:"
-                           f" {format}<br>
Date: {date.date()}" - for tc, mld, format, date in zip( - data_diff['testcase'], data_diff['MLD'], - data_mld['format'], data_diff['date'])], - hoverinfo='text+y' + x=data_diff["date"], + y=data_diff["MAX_ABS_DIFF"], + mode="lines+markers", + name=f"{fmt} - Max MAX_ABS_DIFF", + hovertext=[ + f"Testcase: {tc}
<br>MLD: {mld:.4f}<br>MAX_ABS_DIFF:"
+                    f" {abs_diff:.4f}<br>Format:"
+                    f" {format}<br>
Date: {date.date()}" + for tc, mld, abs_diff, format, date in zip( + data_diff["testcase"], + data_diff["MLD"], + data_diff["MAX_ABS_DIFF"], + data_diff["format"], + data_diff["date"], + ) + ], + hoverinfo="text", ), - row=row, col=col, secondary_y=True + row=row, + col=col, + secondary_y=True, ) fig.update_layout( - title_text='Long-term regression: max MLD and max MAX_ABS_DIFF', - legend=dict(x=1, y=1, orientation='v'), - hovermode='x unified' + title_text="Long-term regression: max MLD and max MAX_ABS_DIFF", + legend=dict(x=1, y=1, orientation="v"), + hovermode="x unified", ) fig.update_xaxes(automargin=True) @@ -118,28 +146,33 @@ def plot_data(df, output_filename): # Update y-axes titles per subplot for i in range(10): - yaxis_num = i*2 + 1 + yaxis_num = i * 2 + 1 yaxis2_num = yaxis_num + 1 - fig['layout'][f'yaxis{yaxis_num}'].update( - title='Max MLD', titlefont=dict(color='blue'), - tickfont=dict(color='blue')) - fig['layout'][f'yaxis{yaxis2_num}'].update( - title='Max MAX_ABS_DIFF', titlefont=dict(color='green'), - tickfont=dict(color='green')) + fig["layout"][f"yaxis{yaxis_num}"].update( + title="Max MLD", titlefont=dict(color="blue"), tickfont=dict(color="blue") + ) + fig["layout"][f"yaxis{yaxis2_num}"].update( + title="Max MAX_ABS_DIFF", + titlefont=dict(color="green"), + tickfont=dict(color="green"), + ) # Save to html fig.write_html(output_filename) -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description='Plot long term logs') +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Plot long term logs") parser.add_argument( - 'root_dir', type=str, help='Root directory containing subdirectories' - ' with CSV log files') + "root_dir", + type=str, + help="Root directory containing subdirectories" " with CSV log files", + ) parser.add_argument( - 'output_filename', type=str, help='Filename of the generated plot. e.g' - '. long_term_regression.html') + "output_filename", + type=str, + help="Filename of the generated plot. e.g" ". long_term_regression.html", + ) args = parser.parse_args() csv_data = read_csv_files(args.root_dir) -- GitLab