1 #!/usr/bin/env python |
1 #!/usr/bin/env python |
|
2 """ |
|
3 Toolkit / executable to scan for duplicate filenames in movie database |
|
4 |
|
5 2017 by mdd |
|
6 """ |
|
7 |
|
8 #pylint: disable=line-too-long |
|
9 #pylint: disable=invalid-name |
2 |
10 |
3 from __future__ import print_function |
11 from __future__ import print_function |
4 import difflib |
12 import difflib |
5 import os, sys |
13 import os, sys |
6 |
14 |
7 BASEDIR="../DREAMBOX" |
class dupechecker(object):
    """
    Scan one or more directory trees for movie files, derive a comparison
    title from every filename and report files whose titles are similar.

    Intended call order: scandir() once per base directory, then analyze(),
    then output().

    Attributes:
        basedir: base directory passed to the most recent scandir() call.
        filelist: list of [title, filename, directory] entries collected by
            scandir(); analyze() overwrites entries it reported as a
            duplicate with None.
        duplicates: dict mapping the full path of a file to the list of
            full paths whose titles matched it.
        ratio: difflib similarity a pair of titles has to exceed (strictly)
            to be reported as duplicates.
    """

    # Extension of recordings named "<field> - <field> - <title>.ts".
    # NOTE(review): the two leading fields are presumably timestamp and
    # channel of a set-top-box recording -- confirm against real data.
    RECORDING_EXTENSION = '.ts'
    # Extensions whose complete basename (without extension) is the title.
    MOVIE_EXTENSIONS = ('.mkv', '.avi', '.mpg', '.mpeg', '.mp4')

    def __init__(self, ratio=0.85):
        """
        Create an empty checker.

        ratio is the similarity threshold used by analyze(); it can also be
        changed later through the attribute of the same name.
        """
        self.basedir = ""
        self.filelist = []
        self.duplicates = {}
        self.ratio = ratio

    def reset(self):
        """
        Forget all scanned files and all duplicates found so far
        """
        self.filelist = []
        self.duplicates = {}

    @classmethod
    def _title_from_filename(cls, filename):
        """
        Return the lower-cased comparison title for filename, or None when
        the extension is not one of the handled movie extensions
        """
        stem, ext = os.path.splitext(filename)
        ext = ext.lower()
        if ext == cls.RECORDING_EXTENSION:
            parts = stem.split(" - ")
            # Only strip the two leading fields when there is something
            # left afterwards; otherwise the title would become empty and
            # every such file would match every other one.
            if len(parts) > 2:
                stem = " - ".join(parts[2:])
            return stem.lower()
        if ext in cls.MOVIE_EXTENSIONS:
            # splitext removes the real extension, whatever its length
            # (a fixed [:-4] slice left a trailing dot on ".mpeg" names).
            return stem.lower()
        return None

    def scandir(self, basedir):
        """
        Scan a base directory recursively for movie files and add them to
        the list for analyze
        """
        self.basedir = basedir
        print("Scanning directory: %s" % basedir)
        for root, _subdirs, files in os.walk(basedir):
            for filename in files:
                title = self._title_from_filename(filename)
                if title is not None:
                    self.filelist.append([title, filename, root])

    def analyze(self):
        """
        Analyze the scanlist for duplicates

        Every entry is compared with all entries after it. Matches are
        stored in self.duplicates under the full path of the earlier file.
        Progress is written to stdout.
        """
        entries = self.filelist
        listlen = len(entries)
        print("%i files to analyze, running duplicate testing loop..." % (
            listlen))

        for idx, entry in enumerate(entries):
            if not entry:
                # already reported as duplicate of an earlier file
                continue
            # "\r" and "\033[K" (erase to end of line) keep the progress
            # display on a single console line
            print("\r%d %s\033[K" % (idx, entry[0]), end='')
            sys.stdout.flush()
            for idx2 in range(idx + 1, listlen):
                other = entries[idx2]
                if not other:
                    continue
                similarity = difflib.SequenceMatcher(
                    a=entry[0], b=other[0]).ratio()
                if similarity > self.ratio:
                    key = os.path.join(entry[2], entry[1])
                    self.duplicates.setdefault(key, []).append(
                        os.path.join(other[2], other[1]))
                    # unset the found duplicate, so that this will not be
                    # scanned again
                    entries[idx2] = None
        print("\n\n\n")

    def output(self):
        """
        Dump found duplicates to console
        """
        for idx, base in enumerate(self.duplicates, 1):
            print("Duplicate file set #%i" % idx)
            print(base)
            for dup in self.duplicates[base]:
                print(dup)
            print()
|
96 |
|
97 |
|
if __name__ == "__main__":
    # parse command line options
    # (argparse is only needed when the file is run as a script)
    import argparse

    parser = argparse.ArgumentParser(
        description='Movie database filename duplicate checker')
    # The comparison in analyze() is a strict "greater than", so a
    # threshold of 1.0 can never be exceeded.
    parser.add_argument('--ratio', type=float, default=0.85,
                        help='filename duplicate threshold 0.1 < ratio < 1.0')
    parser.add_argument('basedir', metavar='basedir', nargs='+',
                        help='one or more base directories')

    args = parser.parse_args()
    dupe = dupechecker()
    dupe.ratio = args.ratio

    # Collect the files of every base directory first, so that duplicates
    # are also found across directories, then analyze and report once.
    for srcstr in args.basedir:
        dupe.scandir(srcstr)
    dupe.analyze()
    dupe.output()
|
116 dupe.output() |