dupecheck.py

changeset 3
569fa9a431b9
child 4
a7e9e7974c22
equal deleted inserted replaced
2:319f8c3fd394 3:569fa9a431b9
1 #!/usr/bin/env python
2
3 import difflib
4 import os, sys
5
# Root directory that is scanned recursively for recordings.
BASEDIR = "../DREAMBOX"

# One [title, filename, directory] entry per matching file found under BASEDIR.
FILELIST = []
# Maps the path of a file to the list of paths whose titles resemble its title.
DUPLICATES = {}
10
def similarity(seq1, seq2):
    """Return the difflib similarity ratio of two sequences.

    The result is a float between 0.0 (nothing in common) and 1.0
    (identical).  The comparison is case-sensitive; no normalisation is
    applied to the inputs here.
    """
    matcher = difflib.SequenceMatcher(None, seq1, seq2)
    return matcher.ratio()
14
def _title_from_filename(filename):
    """Return the lower-cased comparison title for *filename*.

    Returns None when the file is neither a ".ts" nor a ".mkv" file, so the
    caller can skip it.

    For ".ts" files whose name holds at least three " - " separated fields,
    the first two fields are dropped and the rest is the title (presumably
    the recorder writes "<date> - <channel> - <title>.ts" -- TODO confirm).
    Names with fewer fields are used whole.  Dropping two fields from a
    two-field name would leave an empty title, and empty titles compare as
    identical to each other, producing bogus duplicate reports.
    """
    if filename.endswith(".ts"):
        stem = filename[:-len(".ts")]
        fields = stem.split(" - ")
        if len(fields) > 2:
            stem = " - ".join(fields[2:])
        return stem.lower()
    if filename.endswith(".mkv"):
        return filename[:-len(".mkv")].lower()
    return None


# Walk BASEDIR recursively and record every .ts / .mkv file as a
# [title, filename, directory] entry in FILELIST.
for root, subdirs, files in os.walk(BASEDIR):
    for filename in files:
        title = _title_from_filename(filename)
        if title is not None:
            FILELIST.append([title, filename, root])
29
30
# Two titles count as possible duplicates when their similarity ratio is
# strictly greater than this value.
SIMILARITY_THRESHOLD = 0.85

# Compare every FILELIST entry with each entry that follows it, so each
# unordered pair is examined exactly once.  Matches are stored in DUPLICATES
# under the path of the earlier entry.
for idx, (base_title, base_name, base_dir) in enumerate(FILELIST):
    key = os.path.join(base_dir, base_name)
    for other_title, other_name, other_dir in FILELIST[idx + 1:]:
        if similarity(base_title, other_title) > SIMILARITY_THRESHOLD:
            # setdefault creates the list on the first match only, so files
            # without look-alikes never get an entry.
            DUPLICATES.setdefault(key, []).append(
                os.path.join(other_dir, other_name))
42
# Report: each file that has look-alikes, followed by the paths of those
# look-alikes and a blank separator line.  The single-argument parenthesised
# print form produces the same output on Python 2 and Python 3.
for base, dups in DUPLICATES.items():
    print(base)
    for dup in dups:
        print(dup)
    print("")

mercurial