dupecheck.py

Sat, 04 Nov 2017 22:34:12 +0100

author
mdd
date
Sat, 04 Nov 2017 22:34:12 +0100
changeset 3
569fa9a431b9
child 4
a7e9e7974c22
permissions
-rwxr-xr-x

added filename duplicate checker

3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
1 #!/usr/bin/env python
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
2
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
3 import difflib
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
4 import os, sys
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
5
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
# Root directory that is scanned recursively for recordings.
BASEDIR = "../DREAMBOX"

# One [title, filename, directory] entry per recording found under BASEDIR.
FILELIST = []
# Maps the path of a recording to the paths of its suspected duplicates.
DUPLICATES = {}
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
10
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
def similarity(seq1, seq2):
    """Return the difflib similarity ratio of two sequences.

    The comparison is case-sensitive; no normalisation is applied here.

    Args:
        seq1: First sequence (a title string in this script).
        seq2: Second sequence to compare against ``seq1``.

    Returns:
        The value of ``difflib.SequenceMatcher.ratio()``: 1.0 for identical
        sequences, 0.0 when nothing matches.
    """
    return difflib.SequenceMatcher(a=seq1, b=seq2).ratio()
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
14
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
# Collect every .ts and .mkv file below BASEDIR together with a lower-cased
# title that is later used for the similarity comparison.
for root, subdirs, files in os.walk(BASEDIR):
    for filename in files:
        if filename.endswith(".ts"):
            # NOTE(review): .ts names are presumably
            # "<timestamp> - <channel> - <title>.ts"; only the text after the
            # second " - " is used as title -- confirm against real file names.
            parts = filename.split(" - ")
            if len(parts) > 2:
                title = " - ".join(parts[2:])
            else:
                # Fewer than three fields: fall back to the whole name.
                # Slicing parts[2:] here would give an empty title, and all
                # empty titles compare as identical (ratio 1.0), reporting
                # unrelated files as duplicates.
                title = filename
            # Strip the ".ts" extension and normalise case.
            FILELIST.append([title[:-3].lower(), filename, root])
        elif filename.endswith(".mkv"):
            # Strip the ".mkv" extension and normalise case.
            FILELIST.append([filename[:-4].lower(), filename, root])
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
29
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
30
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
# Compare every entry with all entries that follow it, so each unordered pair
# is checked exactly once. Pairs whose titles are more than 85% similar are
# recorded under the path of the earlier entry.
for idx, item in enumerate(FILELIST):
    # Path of the current entry; invariant for the whole inner loop.
    key = os.path.join(item[2], item[1])
    for item2 in FILELIST[idx + 1:]:
        if similarity(item[0], item2[0]) > 0.85:
            # setdefault creates the list on first use without scanning the
            # dictionary keys.
            DUPLICATES.setdefault(key, []).append(
                os.path.join(item2[2], item2[1]))
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
42
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
# Print each file followed by its suspected duplicates. The parenthesised
# single-argument print form produces the same output on Python 2 and 3.
for base, dups in DUPLICATES.items():
    print(base)
    for dup in dups:
        print(dup)
    # NOTE(review): assumed to run once per group as a blank separator line;
    # confirm against the original indentation.
    print("")

mercurial