Commit 51e5a766 authored by kinuthia, committed by norvell
Browse files

add script for processing long-term logs

Usage:
 -  python process_long_term_logs.py logs long_term_regression.html
parent af1d8e58
Loading
Loading
Loading
Loading
+147 −0
Original line number Diff line number Diff line
import os
import pandas as pd
import argparse
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


def read_csv_files(root_dir):
    """Read every ``.csv`` file below ``root_dir`` into a pandas dataframe.

    The directory tree is walked recursively in sorted order, so the
    iteration order of the returned dictionary is reproducible between runs.

    Args:
        root_dir: Directory that is searched recursively for CSV files.

    Returns:
        Dictionary mapping the path of each CSV file (as produced by joining
        the walked directory and the file name) to its parsed dataframe.

    Raises:
        SystemExit: With code -1 if any CSV file cannot be read; the reason
            is printed before exiting.
    """
    csv_data = {}
    for subdir, dirs, files in os.walk(root_dir):
        # os.walk yields entries in arbitrary order; sorting ``dirs`` in place
        # steers the traversal and makes the result order deterministic.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.csv'):
                continue
            file_path = os.path.join(subdir, file)
            try:
                csv_data[file_path] = pd.read_csv(file_path)
            except Exception as e:
                print(f'Failed to read {file_path}: {e}')
                # Raise SystemExit directly: the bare exit() helper is
                # injected by the site module and is not always available.
                raise SystemExit(-1) from e
    return csv_data


def parse_csv_data(csv_data):
    """Reduce each dataframe to the tracked columns and add a 'date' column.

    Of 'testcase', 'format', 'MLD' and 'MAX_ABS_DIFF', the columns that are
    present in a dataframe are kept. The added 'date' column holds the name
    of the directory that directly contains the CSV file (taken from the
    dictionary key).

    Args:
        csv_data: Dictionary mapping CSV file paths to dataframes.

    Returns:
        One dataframe with all reduced dataframes concatenated and a fresh
        0..n-1 index.

    Raises:
        ValueError: If ``csv_data`` is empty, i.e. there is nothing to
            concatenate.
    """
    if not csv_data:
        # pd.concat would also raise ValueError here, but with a message
        # that does not point at the actual cause.
        raise ValueError('No CSV data found: nothing to parse')

    cols_to_keep = ["testcase", "format", "MLD", "MAX_ABS_DIFF"]
    frames = []
    for path, df in csv_data.items():
        cols = [col for col in cols_to_keep if col in df.columns]
        tagged = df[cols].copy()
        # The parent directory name identifies the run the file belongs to.
        tagged['date'] = os.path.basename(os.path.dirname(path))
        frames.append(tagged)

    return pd.concat(frames, ignore_index=True)


def _rows_with_group_max(clean_df, column):
    """Return, per ('format', 'date') group, the row where `column` peaks."""
    # idxmax + loc keeps the grouping columns and the original dtypes, and
    # does not rely on groupby.apply passing the grouping columns along.
    peak_index = clean_df.groupby(['format', 'date'])[column].idxmax()
    return clean_df.loc[peak_index].reset_index(drop=True)


def plot_data(df, output_filename):
    """Plot the per-date maxima of 'MLD' and 'MAX_ABS_DIFF' and save as HTML.

    One subplot is drawn per value of the 'format' column. In each subplot
    the maximum 'MLD' per date goes on the primary y-axis and the maximum
    'MAX_ABS_DIFF' per date on the secondary y-axis. Rows whose 'date',
    'MLD' or 'MAX_ABS_DIFF' cannot be converted, or whose 'format' is
    missing, are ignored.

    Args:
        df: Dataframe with the columns 'testcase', 'format', 'date', 'MLD'
            and 'MAX_ABS_DIFF'. It is not modified.
        output_filename: Path of the HTML file to write.
    """
    # Work on a copy so the caller's dataframe is left untouched.
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df['MLD'] = pd.to_numeric(df['MLD'], errors='coerce')
    df['MAX_ABS_DIFF'] = pd.to_numeric(df['MAX_ABS_DIFF'], errors='coerce')

    # Drop rows with NaT/NaN. 'format' is included because such rows cannot
    # be assigned to a subplot and would break the sorting of format names.
    clean_df = df.dropna(
        subset=['format', 'date', 'MLD', 'MAX_ABS_DIFF']
    ).reset_index(drop=True)

    max_mld = _rows_with_group_max(clean_df, 'MLD')
    max_diff = _rows_with_group_max(clean_df, 'MAX_ABS_DIFF')

    formats = sorted(clean_df['format'].unique())

    # Keep the established 5x2 grid, but grow it when there are more than
    # ten formats instead of failing on an out-of-range subplot position.
    n_cols = 2
    n_rows = max(5, (len(formats) + n_cols - 1) // n_cols)

    fig = make_subplots(
        rows=n_rows, cols=n_cols,
        specs=[[{"secondary_y": True} for _ in range(n_cols)]
               for _ in range(n_rows)],
        subplot_titles=[f'{fmt}' for fmt in formats],
        shared_xaxes='columns'
    )

    for i, fmt in enumerate(formats):
        row = i // n_cols + 1
        col = i % n_cols + 1

        data_mld = max_mld[max_mld['format'] == fmt].sort_values('date')
        data_diff = max_diff[max_diff['format'] == fmt].sort_values('date')

        # Add max 'MLD' to primary y-axis
        fig.add_trace(
            go.Scatter(
                x=data_mld['date'], y=data_mld['MLD'], mode='lines+markers',
                name=f' {fmt} - Max MLD',
                hovertext=[f"Testcase: {tc}<br>MAX_ABS_DIFF: {diff}<br>Format:"
                           f" {fmt_name}<br>Date: {day.date()}"
                           for tc, diff, fmt_name, day in zip(
                               data_mld['testcase'], data_mld['MAX_ABS_DIFF'],
                               data_mld['format'], data_mld['date'])],
                hoverinfo='text+y'
            ),
            row=row, col=col, secondary_y=False
        )

        # Add max 'MAX_ABS_DIFF' to secondary y-axis. All hover fields come
        # from data_diff so they describe the plotted points.
        fig.add_trace(
            go.Scatter(
                x=data_diff['date'], y=data_diff['MAX_ABS_DIFF'],
                mode='lines+markers',
                name=f'{fmt} - Max MAX_ABS_DIFF',
                hovertext=[f"Testcase: {tc}<br>MLD: {mld:.4f}<br>Format:"
                           f" {fmt_name}<br>Date: {day.date()}"
                           for tc, mld, fmt_name, day in zip(
                               data_diff['testcase'], data_diff['MLD'],
                               data_diff['format'], data_diff['date'])],
                hoverinfo='text+y'
            ),
            row=row, col=col, secondary_y=True
        )

    fig.update_layout(
        title_text='Long-term regression: max MLD and max MAX_ABS_DIFF',
        legend=dict(x=1, y=1, orientation='v'),
        hovermode='x unified'
    )

    fig.update_xaxes(automargin=True)
    fig.update_yaxes(automargin=True)

    # Style the primary and secondary y-axes of every subplot. Selecting by
    # secondary_y avoids hard-coding axis numbers, and 'title_font' replaces
    # the deprecated 'titlefont' alias.
    fig.update_yaxes(
        title_text='Max MLD', title_font=dict(color='blue'),
        tickfont=dict(color='blue'), secondary_y=False)
    fig.update_yaxes(
        title_text='Max MAX_ABS_DIFF', title_font=dict(color='green'),
        tickfont=dict(color='green'), secondary_y=True)

    # Save to html
    fig.write_html(output_filename)


if __name__ == '__main__':
    # Command-line entry point: <root_dir> <output_filename>
    cli = argparse.ArgumentParser(description='Plot long term logs')
    cli.add_argument(
        'root_dir',
        type=str,
        help='Root directory containing subdirectories with CSV log files',
    )
    cli.add_argument(
        'output_filename',
        type=str,
        help='Filename of the generated plot. e.g. long_term_regression.html',
    )
    options = cli.parse_args()

    # Load every CSV, reduce to the tracked columns, then render the plot.
    plot_data(
        parse_csv_data(read_csv_files(options.root_dir)),
        options.output_filename,
    )