dupecheck.py

changeset 21
1c0beeca2f9c
parent 15
82361ad7b3fe
child 22
c18abd9198c0
equal deleted inserted replaced
20:5b433bdd2023 21:1c0beeca2f9c
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 """
3 Toolkit / executable to scan for duplicate filenames in movie database
4
5 2017 by mdd
6 """
7
8 #pylint: disable=line-too-long
9 #pylint: disable=invalid-name
2 10
3 from __future__ import print_function 11 from __future__ import print_function
4 import difflib 12 import difflib
5 import os, sys 13 import os, sys
6 14
class dupechecker(object):
    """
    Scan one or more directory trees for movie files, build a list of
    their titles, analyze that list for look-alike titles and dump them.

    Attributes:
        basedir:    directory passed to the most recent scandir() call
        filelist:   list of [title, filename, directory] entries; analyze()
                    replaces every entry it reported as a duplicate with None
        duplicates: maps the full path of the first file of a duplicate set
                    to the list of full paths considered duplicates of it
        ratio:      difflib similarity threshold; two titles count as
                    duplicates when their ratio is greater than this value
    """

    # Extensions (lower case) whose title is simply the file name
    # without the extension.
    MOVIE_EXTENSIONS = ('.mkv', '.avi', '.mpg', '.mpeg', '.mp4')

    def __init__(self):
        self.basedir = ""
        self.filelist = []
        self.duplicates = {}
        self.ratio = 0.85

    def reset(self):
        """
        Forget all scanned files and all duplicates found so far.
        The configured ratio and basedir are kept.
        """
        self.filelist = []
        self.duplicates = {}

    def scandir(self, basedir):
        """
        Scan a base directory recursively for movie files and add them
        to the list for analyze. May be called several times to collect
        files from more than one directory.
        """
        self.basedir = basedir
        print("Scanning directory: %s" % basedir)
        for root, _subdirs, files in os.walk(basedir):
            for filename in files:
                # splitext handles extensions of any length, so ".mpeg"
                # no longer leaves a trailing dot in the title
                stem, ext = os.path.splitext(filename)
                ext = ext.lower()
                if ext == ".ts":
                    # Names of the form "<a> - <b> - <title>.ts" carry the
                    # title after the first two fields.
                    # NOTE(review): presumably "<date> - <channel> - <title>"
                    # recordings; the field meaning is not visible here.
                    parts = stem.split(" - ")
                    if len(parts) > 2:
                        title = " - ".join(parts[2:])
                    else:
                        # With fewer than three fields there is nothing to
                        # strip; dropping two fields would leave an empty
                        # title that matches every other empty title.
                        title = stem
                elif ext in self.MOVIE_EXTENSIONS:
                    title = stem
                else:
                    continue
                self.filelist.append([title.lower(), filename, root])

    def analyze(self):
        """
        Analyze the scanlist for duplicates.

        Every remaining entry is compared with all entries after it.
        Matches are collected in self.duplicates under the full path of
        the earlier entry, and the matched entry is set to None in
        self.filelist so it is not compared again.
        """
        print("%i files to analyze, running duplicate testing loop..." % (
            len(self.filelist)))

        listlen = len(self.filelist)
        for idx in range(listlen):
            entry = self.filelist[idx]
            if not entry:
                continue
            # progress line: carriage return + erase-to-end-of-line
            print("\r%d %s\033[K" % (idx, entry[0]), end='')
            sys.stdout.flush()
            for idx2 in range(idx + 1, listlen):
                candidate = self.filelist[idx2]
                if not candidate:
                    continue
                similarity = difflib.SequenceMatcher(
                    a=entry[0], b=candidate[0]).ratio()
                if similarity > self.ratio:
                    key = os.path.join(entry[2], entry[1])
                    self.duplicates.setdefault(key, []).append(
                        os.path.join(candidate[2], candidate[1]))
                    # unset the found duplicate, so that this will not
                    # be scanned again
                    self.filelist[idx2] = None
        print("\n\n\n")

    def output(self):
        """
        Dump found duplicates to console, one numbered set per base file
        """
        for idx, base in enumerate(self.duplicates, 1):
            print("Duplicate file set #%i" % idx)
            print(base)
            for dup in self.duplicates[base]:
                print(dup)
            print()
96
97
if __name__ == "__main__":
    # Command line entry point: scan every given base directory, then
    # analyze the combined file list once and print the duplicate sets.
    import argparse

    parser = argparse.ArgumentParser(
        description='Movie database filename duplicate checker')
    parser.add_argument('--ratio', type=float, default=0.85,
                        help='filename duplicate threshold 0.1 < ratio < 1.0')
    parser.add_argument('basedir', metavar='basedir', nargs='+',
                        help='one or more base directories')

    args = parser.parse_args()
    # Reject thresholds outside the documented range instead of silently
    # reporting everything (too low) or nothing (1.0 or above).
    if not 0.1 < args.ratio < 1.0:
        parser.error('--ratio must satisfy 0.1 < ratio < 1.0')

    dupe = dupechecker()
    dupe.ratio = args.ratio

    # NOTE(review): analyze/output are placed after the scan loop so that
    # duplicates across all base directories are found in a single pass;
    # the original indentation is not recoverable from the diff view.
    for srcstr in args.basedir:
        dupe.scandir(srcstr)
    dupe.analyze()
    dupe.output()

mercurial