Commit 1b05e2cc authored by Jan Kiene
Browse files

add check for duplicate hash values

parent 71d8323d
Loading
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -30,6 +30,7 @@
import argparse
from pathlib import Path
from hashlib import md5
from collections import Counter


def get_hash_line_for_file(file: Path, experiment_dir: Path):
@@ -47,6 +48,13 @@ def main(experiment_dir, out_file):
    wav_files = sorted(experiment_dir.glob("proc_output*/**/*c??.wav"))

    hashlines = [get_hash_line_for_file(f, experiment_dir) for f in wav_files]
    count = Counter([line.split()[-1] for line in hashlines])
    duplicates = [line for line in hashlines if count[line.split()[-1]] != 1]

    if len(duplicates) != 0:
        print("Found duplicate hashes in these lines:")
        for dup in duplicates:
            print(dup)

    with open(out_file, "w") as f:
        f.writelines(hashlines)