Sat, 04 Nov 2017 22:52:01 +0100
prepare for speedup
#!/usr/bin/env python
"""Report recordings below BASEDIR whose titles look like duplicates.

The script walks BASEDIR, derives a lower-cased title from every ``.ts`` and
``.mkv`` file name, compares every pair of titles with
``difflib.SequenceMatcher`` and prints each file together with the later
files whose title similarity exceeds THRESHOLD.
"""
from __future__ import print_function

import difflib
import os
import sys

BASEDIR = "../DREAMBOX"
# Similarity ratio (0.0 .. 1.0) a pair of titles must exceed to be reported.
THRESHOLD = 0.85
# Filled by main(): [title, filename, directory] for every recording found.
FILELIST = []
# Filled by main(): path of a file -> paths of later files with a similar title.
DUPLICATES = {}


def similarity(seq1, seq2):
    """Return the SequenceMatcher similarity ratio of two sequences.

    The comparison is case-sensitive; callers pass lower-cased titles.
    """
    return difflib.SequenceMatcher(a=seq1, b=seq2).ratio()


def is_similar(seq1, seq2, threshold=THRESHOLD):
    """Return True if ``similarity(seq1, seq2)`` exceeds *threshold*.

    ``real_quick_ratio()`` and ``quick_ratio()`` are documented upper bounds
    on ``ratio()``, so testing them first rejects most pairs cheaply without
    changing the result of the comparison.
    """
    matcher = difflib.SequenceMatcher(a=seq1, b=seq2)
    return (matcher.real_quick_ratio() > threshold
            and matcher.quick_ratio() > threshold
            and matcher.ratio() > threshold)


def extract_title(filename):
    """Return the lower-cased title encoded in *filename*.

    Returns None when the file is neither a ``.ts`` nor a ``.mkv`` file.

    For ``.ts`` files the first two " - " separated fields are dropped when
    at least three fields are present (presumably the recorder writes
    "<date> - <channel> - <title>.ts" -- TODO confirm).  Names with fewer
    fields are kept whole: dropping two fields from a two-field name would
    leave an empty title, and empty titles compare as identical to each
    other, which flagged unrelated files as duplicates.
    """
    if filename.endswith(".ts"):
        stem = filename[:-len(".ts")]
        fields = stem.split(" - ")
        if len(fields) >= 3:
            stem = " - ".join(fields[2:])
        return stem.lower()
    if filename.endswith(".mkv"):
        return filename[:-len(".mkv")].lower()
    return None


def collect_files(basedir):
    """Walk *basedir* and return ``[title, filename, directory]`` entries.

    Only files for which extract_title() yields a title are included.
    """
    entries = []
    for root, _subdirs, files in os.walk(basedir):
        for filename in files:
            title = extract_title(filename)
            if title is not None:
                entries.append([title, filename, root])
    return entries


def find_duplicates(entries, threshold=THRESHOLD, progress=True):
    """Map each file path to the later entries that have a similar title.

    *entries* is a list of ``[title, filename, directory]``.  Every entry is
    compared with all entries that follow it, so each pair is tested once.
    A file can show up in more than one set when several titles are
    mutually similar.  With *progress* true, a one-line status is rewritten
    on stdout for every entry processed.
    """
    duplicates = {}
    total = len(entries)
    for idx, (title, filename, root) in enumerate(entries):
        if progress:
            # "\033[K" clears to end of line, "\r" returns the cursor so the
            # next status overwrites this one.
            print("%d %s\033[K\r" % (idx, title), end='')
            sys.stdout.flush()
        for other in range(idx + 1, total):
            other_title, other_filename, other_root = entries[other]
            if is_similar(title, other_title, threshold):
                key = os.path.join(root, filename)
                duplicates.setdefault(key, []).append(
                    os.path.join(other_root, other_filename))
    return duplicates


def print_report(duplicates):
    """Print every duplicate set: the base file, then its look-alikes."""
    for number, base in enumerate(duplicates, 1):
        print("Duplicate file set #%i" % number)
        print(base)
        for dup in duplicates[base]:
            print(dup)
        print()


def main():
    """Scan BASEDIR, detect duplicate titles and print the report."""
    print("Reading files...")
    FILELIST.extend(collect_files(BASEDIR))
    print("%i files found, running duplicate testing loop" % len(FILELIST))
    DUPLICATES.update(find_duplicates(FILELIST))
    print("\n\n\n")
    print_report(DUPLICATES)


if __name__ == "__main__":
    main()