# HG changeset patch # User mdd # Date 1509833557 -3600 # Node ID 51e57e9f8db172a6e5be857d3545a2cf57828f45 # Parent a7e9e7974c2236b654ae756370eaa44c913a7064 little speed tuning done diff -r a7e9e7974c22 -r 51e57e9f8db1 dupecheck.py --- a/dupecheck.py Sat Nov 04 22:52:01 2017 +0100 +++ b/dupecheck.py Sat Nov 04 23:12:37 2017 +0100 @@ -9,10 +9,6 @@ FILELIST=[] DUPLICATES={} -def similarity(seq1, seq2): - #return difflib.SequenceMatcher(a=seq1.lower(), b=seq2.lower()).ratio() - return difflib.SequenceMatcher(a=seq1, b=seq2).ratio() - print("Reading files...") for root, subdirs, files in os.walk(BASEDIR): for filename in files: @@ -30,20 +26,23 @@ FILELIST.append([title, filename, root]) print("%i files found, running duplicate testing loop" % len(FILELIST)) - -for idx, item in enumerate(FILELIST): - comparelist = FILELIST[idx+1:] - print("%d %s\033[K\r" % (idx, item[0]), +listlen = len(FILELIST) +for idx in range(listlen): + if not FILELIST[idx]: + continue + print("\r%d %s\033[K" % (idx, FILELIST[idx][0]), end='') sys.stdout.flush() - for idx2, item2 in enumerate(comparelist): - if similarity(item[0], item2[0]) > 0.85: + for idx2 in range(idx + 1, listlen): + if FILELIST[idx2] and difflib.SequenceMatcher(a = FILELIST[idx][0], b = FILELIST[idx2][0]).ratio() > 0.85: #print "possible duplicate %d %s" % (idx2, item2[0]) - key = os.path.join(item[2], item[1]) + key = os.path.join(FILELIST[idx][2], FILELIST[idx][1]) if not key in DUPLICATES.keys(): DUPLICATES[key] = [] DUPLICATES[key].append( - os.path.join(item2[2], item2[1])) + os.path.join(FILELIST[idx2][2], FILELIST[idx2][1])) + # unset the found duplicate, so that this will not be scanned again + FILELIST[idx2] = None print("\n\n\n") idx = 1