Commit ecac007b authored by Jan Kiene
Browse files

improve printout of duplicate hashes to make it more readable

parent 823a56f8
Loading
Loading
Loading
Loading
+19 −5
Original line number Diff line number Diff line
@@ -44,17 +44,31 @@ def get_hash_line_for_file(file: Path, output_dir: Path):
    return hashline


def get_duplicates(hashlines: list) -> dict:
    """Return the hashes that occur on more than one hashline.

    The last whitespace-separated token of each line is treated as the hash;
    everything in front of it (with surrounding whitespace stripped) is
    treated as the file entry belonging to that hash.

    Args:
        hashlines: Strings of the form ``"<file entry> <hash>"``. Lines that
            contain only whitespace are ignored.

    Returns:
        A dict mapping each duplicated hash to the list of file entries that
        share it. Hashes appear in order of first occurrence and file entries
        in input order. Hashes that occur only once are omitted.
    """
    files_by_hash = {}
    for line in hashlines:
        # Split off only the trailing token so that the hash is matched by
        # position, not by substring search: a file name that happens to
        # contain the hash text must neither be altered nor cause a line
        # with a different hash to be reported.
        parts = line.rsplit(maxsplit=1)
        if not parts:
            continue

        digest = parts[-1]
        # A line consisting of just a hash has no file entry.
        entry = parts[0].strip() if len(parts) == 2 else ""
        files_by_hash.setdefault(digest, []).append(entry)

    return {
        digest: files for digest, files in files_by_hash.items() if len(files) > 1
    }


def main(output_dir, out_file):
    """Hash the matching wav files, report duplicate hashes and write all hashlines.

    Args:
        output_dir: Directory object providing ``glob``; it is searched with
            the pattern ``*/**/*c[0-9][0-9].wav`` and also passed on to
            ``get_hash_line_for_file`` for every match.
        out_file: Path of the file that receives all hashlines. It is opened
            in ``"w"`` mode, so existing content is replaced.
    """
    # Sort so that the hashlines are produced in a stable order.
    wav_files = sorted(output_dir.glob("*/**/*c[0-9][0-9].wav"))

    hashlines = [get_hash_line_for_file(f, output_dir) for f in wav_files]
    duplicates = get_duplicates(hashlines)

    # Duplicates are only reported on stdout; the output file is written
    # regardless of whether any were found.
    if duplicates:
        print(
            "Found duplicate hashes! The following hashes were found in multipe files:"
        )
        for digest, files in duplicates.items():
            print(f"{digest} - {', '.join(files)}")

    # writelines() adds no separators, so the lines are written exactly as
    # returned by get_hash_line_for_file.
    with open(out_file, "w") as f:
        f.writelines(hashlines)