little speed tuning done

2017-11-04

author
mdd
date
Sat, 04 Nov 2017 23:12:37 +0100 (2017-11-04)
changeset 5
51e57e9f8db1
parent 4
a7e9e7974c22
child 6
1420abafd049

little speed tuning done

dupecheck.py file | annotate | diff | comparison | revisions
--- a/dupecheck.py	Sat Nov 04 22:52:01 2017 +0100
+++ b/dupecheck.py	Sat Nov 04 23:12:37 2017 +0100
@@ -9,10 +9,6 @@
 FILELIST=[]
 DUPLICATES={}
 
-def similarity(seq1, seq2):
-    #return difflib.SequenceMatcher(a=seq1.lower(), b=seq2.lower()).ratio()
-    return difflib.SequenceMatcher(a=seq1, b=seq2).ratio()
-
 print("Reading files...")
 for root, subdirs, files in os.walk(BASEDIR):
     for filename in files:
@@ -30,20 +26,23 @@
             FILELIST.append([title, filename, root])
 print("%i files found, running duplicate testing loop" % len(FILELIST))
 
-
-for idx, item in enumerate(FILELIST):
-    comparelist = FILELIST[idx+1:]
-    print("%d %s\033[K\r" % (idx, item[0]),
+listlen = len(FILELIST)
+for idx in range(listlen):
+    if not FILELIST[idx]:
+        continue
+    print("\r%d %s\033[K" % (idx, FILELIST[idx][0]),
         end='')
     sys.stdout.flush()
-    for idx2, item2 in enumerate(comparelist):
-        if similarity(item[0], item2[0]) > 0.85:
+    for idx2 in range(idx + 1, listlen):
+        if FILELIST[idx2] and difflib.SequenceMatcher(a = FILELIST[idx][0], b = FILELIST[idx2][0]).ratio() > 0.85:
             #print "possible duplicate %d %s" % (idx2, item2[0])
-            key = os.path.join(item[2], item[1])
+            key = os.path.join(FILELIST[idx][2], FILELIST[idx][1])
             if not key in DUPLICATES.keys():
                 DUPLICATES[key] = []
             DUPLICATES[key].append(
-                os.path.join(item2[2], item2[1]))
+                os.path.join(FILELIST[idx2][2], FILELIST[idx2][1]))
+            # unset the found duplicate, so that this will not be scanned again
+            FILELIST[idx2] = None
 
 print("\n\n\n")
 idx = 1

mercurial