dupecheck.py

changeset 3
569fa9a431b9
child 4
a7e9e7974c22
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dupecheck.py	Sat Nov 04 22:34:12 2017 +0100
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+
+import difflib
+import os, sys
+
+BASEDIR = "../DREAMBOX"  # root directory that os.walk scans for recordings
+
+FILELIST = []  # [title, filename, directory] entries filled by the scan
+DUPLICATES = {}  # path of a file -> list of paths with a similar title
+
+def similarity(seq1, seq2):
+    """Return difflib's similarity ratio (0.0-1.0) of seq1 and seq2, as given."""
+    return difflib.SequenceMatcher(None, seq1, seq2).ratio()
+
+# Index every recording under BASEDIR as [normalized title, filename, dir].
+for root, subdirs, files in os.walk(BASEDIR):
+    for filename in files:
+        if filename.endswith(".ts"):
+            # Presumably "<date> - <channel> - <title>.ts": the first two
+            # fields are dropped.  Only do so when a title remains; otherwise
+            # every two-field name became "" and matched all the others.
+            parts = filename.split(" - ")
+            title = " - ".join(parts[2:]) if len(parts) > 2 else filename
+            FILELIST.append([title[:-3].lower(), filename, root])
+        elif filename.endswith(".mkv"):
+            # .mkv names are used whole; only the extension is stripped.
+            FILELIST.append([filename[:-4].lower(), filename, root])
+
+# Titles whose similarity ratio exceeds this are reported as duplicates.
+THRESHOLD = 0.85
+
+# Compare each entry only with the ones after it, so a pair is tested once.
+for idx, item in enumerate(FILELIST):
+    key = os.path.join(item[2], item[1])
+    for item2 in FILELIST[idx + 1:]:
+        if similarity(item[0], item2[0]) > THRESHOLD:
+            # setdefault creates the list on the first hit for this key.
+            DUPLICATES.setdefault(key, []).append(
+                os.path.join(item2[2], item2[1]))
+
+# Report: base file, then its likely duplicates, then a blank separator line.
+# Sorted so the output order is stable; print(x) works on Python 2 and 3.
+for base in sorted(DUPLICATES):
+    print(base)
+    for dup in DUPLICATES[base]:
+        print(dup)
+    print("")
\ No newline at end of file

mercurial