Commit 119ef098 authored by Jan Kiene
Browse files

add write out option for histograms to csv

parent 93d77e5f
Loading
Loading
Loading
Loading
Loading
+23 −3
Original line number Diff line number Diff line
@@ -37,17 +37,22 @@ def create_histograms(
    display_only: bool,
    bins_for_measures=BINS_FOR_MEASURES,
    prefix="",
    write_out_histograms=False,
):
    formats = df["format"].unique()
    categories = df["category"].unique()

    if not display_only:
    if not display_only or write_out_histograms:
        output_folder.mkdir(exist_ok=True, parents=True)

    for measure in measures:
        measure_in_df = prefix + measure
        bins = bins_for_measures.get(measure, get_bins_for_diff(df[measure_in_df]))
        x = [f"{x}" for x in bins] + ["", "ERROR"]

        df_hist = pd.DataFrame(columns=["format", "category"] + x)
        hist_row_count = 0

        for fmt in formats:
            fig, ax = plt.subplots()
            ax.xaxis.set_major_formatter("{x:.1f}")
@@ -57,9 +62,9 @@ def create_histograms(
                df_slice = df[data_mask]
                error_mask = df_slice["result"] == "ERROR"
                n_errors = np.sum(error_mask)
                df_hist = df_slice[np.logical_not(error_mask)]
                df_slice = df_slice[np.logical_not(error_mask)]

                counts, _ = np.histogram(df_hist[measure_in_df], bins)
                counts, _ = np.histogram(df_slice[measure_in_df], bins)

                data = np.concatenate([counts, [0], [n_errors], [0]])
                ax.bar(
@@ -74,6 +79,10 @@ def create_histograms(
                )
                bottom += data

                hist_row = [fmt, cat] + list(counts) + [0] + [0, n_errors]
                df_hist.loc[hist_row_count] = hist_row
                hist_row_count += 1

            # Histogram layout
            ax.set_title(fmt)
            ax.legend(loc="best")
@@ -94,6 +103,11 @@ def create_histograms(
                plt.savefig(image_path)
                plt.close(fig)

        if write_out_histograms:
            df_hist.to_csv(
                output_folder.joinpath(f"histogram_{measure}.csv"), index=False
            )

    if display_only:
        plt.show()

@@ -133,6 +147,11 @@ Use this for visualising diff scores.""",
        default="",
        help="Common suffix to use when collecting measures from the input csv file",
    )
    parser.add_argument(
        "--write-out-histograms",
        action="store_true",
        help="Write out the histogram values to csv",
    )
    args = parser.parse_args()
    df = pd.read_csv(args.csv_report)

@@ -152,4 +171,5 @@ Use this for visualising diff scores.""",
        args.display_only,
        bins_for_measures,
        args.prefix,
        args.write_out_histograms,
    )