Thu, 04 Oct 2018 02:06:57 +0200
added ability to provide a list of title prefix strings to ignore in duplicate checking
3 | 1 | #!/usr/bin/env python |
33 | 2 | # -*- coding: utf-8 -*- |
21 | 3 | """ |
4 | Toolkit / executable to scan for duplicate filenames in movie database | |
5 | ||
6 | 2017 by mdd | |
7 | """ | |
8 | ||
9 | #pylint: disable=line-too-long | |
10 | #pylint: disable=invalid-name | |
3 | 11 | |
4 | 12 | from __future__ import print_function |
35
14c966c10648
added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents:
33
diff
changeset
|
13 | import os, sys, re |
14c966c10648
added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents:
33
diff
changeset
|
14 | |
14c966c10648
added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents:
33
diff
changeset
|
# Strip bracketed annotations like "(2017)" or "[HD]" from titles.
# Raw string avoids invalid-escape-sequence warnings on Python 3.
RE_PARENTHESES = re.compile(r"[\(\[].*?[\)\]]")
3 | 16 | |
def similarity(a, b):
    """Return a similarity ratio in [0.0, 1.0] for the strings *a* and *b*.

    Dispatches on the module-level DIFFLIB flag: difflib's SequenceMatcher
    when set, otherwise the (faster) Levenshtein C implementation.
    """
    if not DIFFLIB:
        return Levenshtein.ratio(a, b)
    return difflib.SequenceMatcher(a=a, b=b).ratio()
suffixes = ['b', 'K', 'M', 'G', 'T', 'P']
def humansize(nbytes):
    """Format a byte count as a short human-readable string, e.g. "1.5 K"."""
    rank = 0
    # divide down until the value fits under 1024 or we run out of suffixes
    while nbytes >= 1024 and rank < len(suffixes) - 1:
        nbytes /= 1024.
        rank += 1
    # drop trailing zeros and a dangling decimal point ("1.00" -> "1")
    text = ('%.2f' % nbytes).rstrip('0').rstrip('.')
    return '%s %s' % (text, suffixes[rank])
def replace_all(text, dic):
    """Return *text* with every key of *dic* replaced by its mapped value.

    Fix: dict.iteritems() is Python-2-only and raises AttributeError on
    Python 3; dict.items() behaves the same on both interpreters.
    """
    for old, new in dic.items():
        text = text.replace(old, new)
    return text
class dupechecker(object):
    """
    Simple class to scan multiple directories recursively,
    build a list of movie filenames,
    analyze the list for duplicates and dump them.
    """
    def __init__(self):
        self.basedir = ""
        # each entry is [title, filename, root_dir, extension]
        self.filelist = []
        # maps an original file path to the list of its duplicate paths
        self.duplicates = {}
        # minimum similarity ratio for two titles to count as duplicates
        self.ratio = 0.85
        # title prefixes excluded from duplicate checking
        self.ignore_fileprefix = []

    def reset(self):
        """Drop all scan results so the instance can be reused."""
        self.filelist = []
        self.duplicates = {}

    def scandir(self, basedir, extra=None):
        """
        Scan a base directory for movie files and add them to
        the list for analyze.

        basedir: directory walked recursively.
        extra:   optional list of additional extensions (e.g. ['.txt'])
                 to pick up besides the built-in movie extensions.
                 (Fix: mutable default argument [] replaced by None sentinel.)
        """
        if extra is None:
            extra = []
        self.basedir = basedir
        print("Scanning directory: %s" % basedir)
        for root, _subdirs, files in os.walk(basedir):
            for filename in files:
                ext = os.path.splitext(filename)[1].lower()
                if ext == ".ts":
                    # TV recordings look like "channel - date - title.ts";
                    # keep only the title portion when the pattern matches
                    title = filename.split(" - ")
                    if len(title) == 1:
                        title = title[0]
                    else:
                        title = " - ".join(title[2:])
                    title = title[:-3].lower()  # strip the ".ts" suffix
                    # remove parentheses with contents in title
                    title = RE_PARENTHESES.sub("", title)
                    self.filelist.append([title, filename, root, ext])
                elif ext in ['.mkv', '.avi', '.mpg', '.mpeg', '.mp4']:
                    # Fix: splitext instead of filename[:-4], which left a
                    # trailing dot for 5-char extensions such as ".mpeg"
                    title = os.path.splitext(filename)[0].lower()
                    title = RE_PARENTHESES.sub("", title)
                    self.filelist.append([title, filename, root, ext])
                elif ext in extra:
                    title = os.path.splitext(filename)[0].lower()
                    title = RE_PARENTHESES.sub("", title)
                    self.filelist.append([title, filename, root, ext])

    def fixnames(self):
        """
        Search for defect filenames and remove illegal characters
        by renaming the files on disk (mkv/txt only).
        """
        # uses the module-level `re` import; the old local import was redundant
        for item in self.filelist:
            if item[3] not in ['.mkv', '.txt']:
                continue
            # any non-alphanumeric characters in filename?
            cleanfn = replace_all(item[1], {
                #'ä':'ae', 'Ä':'Ae',
                #'ö':'oe', 'Ö':'Oe',
                #'ü':'ue', 'Ü':'Ue',
                'ß': 'ss',
            })
            cleanfn = re.sub(r'[^A-Za-z0-9\.\_\-\(\)\&öäüÖÄÜ\' ]', '-', cleanfn)
            if item[1] == cleanfn:
                continue
            print(item[1])
            os.rename(
                os.path.join(item[2], item[1]),
                os.path.join(item[2], cleanfn)
            )

    def statistics(self):
        """
        Summarize disk usage and print stats about found filetypes.
        """
        stats = {}  # ext -> [count, total_size_bytes]
        for item in self.filelist:
            if item[3] not in stats:
                stats[item[3]] = [0, 0.0]
            stats[item[3]][0] += 1
            stats[item[3]][1] += os.stat(
                os.path.join(
                    item[2], item[1])).st_size
        print("%5s %6s %10s" % (
            "File:",
            "Count:",
            "Size:"))
        sum_count = 0
        sum_size = 0.0
        for ext in stats:
            sum_count += stats[ext][0]
            sum_size += stats[ext][1]
            print("%5s %6i %10s" % (
                ext, stats[ext][0],
                humansize(stats[ext][1])))
        print("%5s %6i %10s" % (
            "TOTAL", sum_count,
            humansize(sum_size)))

    def analyze(self):
        """
        Analyze the scanlist for duplicates: every pair of titles whose
        similarity exceeds self.ratio is recorded in self.duplicates.
        """
        listlen = len(self.filelist)
        print("%i files to analyze, running duplicate testing loop..." % (
            listlen))

        # remove potentially unwanted entries from the list
        if self.ignore_fileprefix:
            # iterate backwards so deletions do not shift unvisited indexes
            for idx in reversed(range(listlen)):
                for tst in self.ignore_fileprefix:
                    if tst == '':
                        continue
                    if self.filelist[idx][0].startswith(tst):
                        del self.filelist[idx]
                        break
            listlen = len(self.filelist)

        for idx in range(listlen):
            if not self.filelist[idx]:
                continue
            # progress line, \033[K clears to end of line
            print("\r%d %s\033[K" % (
                idx, self.filelist[idx][0]), end='')
            sys.stdout.flush()
            for idx2 in range(idx + 1, listlen):
                if self.filelist[idx2]:
                    if similarity(self.filelist[idx][0], self.filelist[idx2][0]) > self.ratio:
                        key = os.path.join(self.filelist[idx][2], self.filelist[idx][1])
                        if key not in self.duplicates:
                            self.duplicates[key] = []
                        self.duplicates[key].append(
                            os.path.join(
                                self.filelist[idx2][2],
                                self.filelist[idx2][1]
                            ))
                        # unset the found duplicate, so that this will not be scanned again
                        self.filelist[idx2] = None
        print("\n\n")

    def output(self):
        """
        Dump found duplicates to console.
        """
        idx = 1
        for base in self.duplicates:
            print("Duplicate file set #%i" % idx)
            print(base)
            for dup in self.duplicates[base]:
                print(dup)
            print()
            idx += 1
if __name__ == "__main__":
    # parse command line options
    import argparse

    parser = argparse.ArgumentParser(
        description='Movie database filename duplicate checker')
    parser.add_argument('--ratio', type=float, default=0.85,
        help='filename duplicate threshold 0.1 < ratio 1.0 (default 0.85)')
    parser.add_argument('--difflib', action='store_true', default=False,
        help='force the use of difflib instead Levenshtein')
    parser.add_argument('--stats', action='store_true', default=False,
        help='generate stats summary instead of check for duplicates')
    parser.add_argument('--fixnames', action='store_true', default=False,
        help='scan for mkv and txt, fix broken filenames for windows')
    parser.add_argument('basedir', metavar='basedir', nargs='+',
        help='one or more base directories')

    args = parser.parse_args()
    dupe = dupechecker()
    dupe.ratio = args.ratio
    # select the similarity backend; DIFFLIB is read by similarity()
    if args.difflib:
        DIFFLIB = True
        import difflib
    else:
        try:
            import Levenshtein
            DIFFLIB = False
        except ImportError:
            import difflib
            DIFFLIB = True
            print("Consider 'pip install python-Levenshtein' for faster analyze")

    if os.path.isfile("dupecheck-ignore.txt"):
        # read the entire file line by line into buffer.
        # Fix: open in text mode ("rb" yields bytes on Python 3, which makes
        # the str rstrip below raise TypeError) and close the handle via with.
        print("Loading ignore filename prefixes file for dupe checking...")
        with open("dupecheck-ignore.txt", "r") as ignorefile:
            dupe.ignore_fileprefix = [line.rstrip('\r\n') for line in ignorefile]

    if args.fixnames:
        for srcstr in args.basedir:
            dupe.scandir(srcstr, ['.txt'])
        if len(dupe.filelist) > 0:
            print("Checking %i file names..." % len(dupe.filelist))
            dupe.fixnames()
        dupe.filelist = []
        sys.exit(0)

    for srcstr in args.basedir:
        dupe.scandir(srcstr)

    if args.stats or args.fixnames:
        dupe.statistics()
    else:
        dupe.analyze()
        dupe.output()