Sat, 04 Nov 2017 23:12:37 +0100
little speed tuning done
#!/usr/bin/env python
"""Report recordings below BASEDIR whose titles look like duplicates.

The script walks BASEDIR, derives a lower-cased title from every ``.ts`` and
``.mkv`` file name, compares the titles pairwise with
``difflib.SequenceMatcher`` and prints each set of files whose titles are
more than 85 % similar.
"""
from __future__ import print_function

import difflib
import os
import sys

BASEDIR = "../DREAMBOX"
FILELIST = []
DUPLICATES = {}

# Two titles count as duplicates when their similarity ratio exceeds this.
SIMILARITY_THRESHOLD = 0.85
# Field separator inside ".ts" file names.
TITLE_SEPARATOR = " - "


def title_from_filename(filename):
    """Return the lower-cased comparison title for *filename*.

    ``.ts`` names are split on ``" - "`` and the first two fields are
    dropped (presumably a timestamp and a channel name -- confirm against
    the recorder's naming scheme).  When that would leave nothing, e.g. the
    name has fewer than three fields, the whole name without its extension
    is used instead; otherwise every such file would get the empty title
    and all of them would be reported as duplicates of one another.

    ``.mkv`` names are used as-is, minus the extension.

    Returns None for any other extension (the check is case-sensitive).
    """
    if filename.endswith(".ts"):
        stem = filename[:-3]
        fields = stem.split(TITLE_SEPARATOR)
        title = TITLE_SEPARATOR.join(fields[2:]) or stem
        return title.lower()
    if filename.endswith(".mkv"):
        return filename[:-4].lower()
    return None


def collect_files(basedir):
    """Walk *basedir* and return ``[title, filename, directory]`` entries.

    Only files for which title_from_filename() yields a title are listed.
    """
    entries = []
    for root, _subdirs, files in os.walk(basedir):
        for filename in files:
            title = title_from_filename(filename)
            if title is not None:
                entries.append([title, filename, root])
    return entries


def is_similar(first, second, threshold=SIMILARITY_THRESHOLD):
    """Return True when ``SequenceMatcher(a=first, b=second).ratio()``
    is greater than *threshold*.

    real_quick_ratio() and quick_ratio() are documented upper bounds on
    ratio(), so checking them first rejects most pairs cheaply without
    changing the outcome.
    """
    matcher = difflib.SequenceMatcher(a=first, b=second)
    return (matcher.real_quick_ratio() > threshold
            and matcher.quick_ratio() > threshold
            and matcher.ratio() > threshold)


def find_duplicates(entries, show_progress=False):
    """Group *entries* by title similarity.

    *entries* is a sequence of ``[title, filename, directory]`` items as
    produced by collect_files(); it is not modified.

    Returns a dict that maps the path of the first file of each duplicate
    set to the list of paths of the later files similar to it.  A file that
    has been assigned to a set is neither compared again nor used as the
    base of a set of its own.

    With *show_progress* true, the index and title currently being
    processed are written to stdout on a single, continuously rewritten
    line.
    """
    duplicates = {}
    consumed = set()  # indices already assigned to an earlier set
    total = len(entries)
    for idx, (title, filename, root) in enumerate(entries):
        if idx in consumed:
            continue
        if show_progress:
            # "\r" returns to the line start, "\033[K" clears the rest of it.
            print("\r%d %s\033[K" % (idx, title), end='')
            sys.stdout.flush()
        base_path = os.path.join(root, filename)
        for idx2 in range(idx + 1, total):
            if idx2 in consumed:
                continue
            other_title, other_filename, other_root = entries[idx2]
            if is_similar(title, other_title):
                duplicates.setdefault(base_path, []).append(
                    os.path.join(other_root, other_filename))
                consumed.add(idx2)
    return duplicates


print("Reading files...")
FILELIST.extend(collect_files(BASEDIR))

print("%i files found, running duplicate testing loop" % len(FILELIST))
DUPLICATES.update(find_duplicates(FILELIST, show_progress=True))

print("\n\n\n")
for set_number, (base, dups) in enumerate(DUPLICATES.items(), 1):
    print("Duplicate file set #%i" % set_number)
    print(base)
    for dup in dups:
        print(dup)
    print()