Thu, 04 Oct 2018 00:43:26 +0200
fix: cropdetect
3 | 1 | #!/usr/bin/env python |
33 | 2 | # -*- coding: utf-8 -*- |
21 | 3 | """ |
4 | Toolkit / executable to scan for duplicate filenames in movie database | |
5 | ||
6 | 2017 by mdd | |
7 | """ | |
8 | ||
9 | #pylint: disable=line-too-long | |
10 | #pylint: disable=invalid-name | |
3 | 11 | |
4 | 12 | from __future__ import print_function |
3 | 13 | import os, sys |
14 | ||
22
c18abd9198c0
implemented Levenshtein algorithm for incredible speedup
mdd
parents:
21
diff
changeset
|
def similarity(a, b):
    """
    Return the similarity of the strings a and b as ratio between 0.0 and 1.0

    The module level flag DIFFLIB selects the backend: a true value forces
    difflib.SequenceMatcher, otherwise the Levenshtein module is used.
    The flag is only assigned by the command line entry point. When it is
    missing (module imported as library) Levenshtein is tried first and
    difflib is the fallback, so the function never fails with a NameError.
    """
    # globals().get() instead of a plain name lookup: DIFFLIB does not exist
    # until the command line entry point has assigned it
    if not globals().get('DIFFLIB'):
        try:
            import Levenshtein
        except ImportError:
            # remember the failure, so the import is not retried on every call
            globals()['DIFFLIB'] = True
        else:
            return Levenshtein.ratio(a, b)
    import difflib
    return difflib.SequenceMatcher(a=a, b=b).ratio()
c18abd9198c0
implemented Levenshtein algorithm for incredible speedup
mdd
parents:
21
diff
changeset
|
20 | |
suffixes = ['b', 'K', 'M', 'G', 'T', 'P']
def humansize(nbytes):
    """
    Format a byte count as short human readable string, e.g. '1.5 K'

    The value is divided by 1024 until it drops below 1024 or the largest
    known unit is reached, then printed with at most two decimals.
    """
    # every unit except the largest one can be left by one more division
    for unit in suffixes[:-1]:
        if not nbytes >= 1024:
            break
        nbytes = nbytes / 1024.
    else:
        unit = suffixes[-1]
    # two decimals, but without trailing zeros or a dangling decimal point
    number = '%.2f' % nbytes
    number = number.rstrip('0')
    number = number.rstrip('.')
    return '%s %s' % (number, unit)
29 | ||
def replace_all(text, dic):
    """
    Replace every occurrence of each key of dic in text by its value

    The pairs are applied one after another in the iteration order of dic,
    so a later pair also sees the result of the earlier ones.
    Returns the resulting string, text itself is not modified.
    """
    # items() exists on Python 2 and 3, iteritems() is Python 2 only
    for search, replacement in dic.items():
        text = text.replace(search, replacement)
    return text
34 | ||
class dupechecker(object):
    """
    Simple class to scan multiple directories recursive,
    build a list of movie filenames.
    analyze the list for duplicates and dump them

    Attributes:
        basedir: base directory of the most recent scandir() call
        filelist: list of [title, filename, directory, extension] entries
        duplicates: dict mapping the path of a file to the list of paths
            that analyze() considered duplicates of it
        ratio: similarity threshold, titles scoring above it are duplicates
    """
    def __init__(self):
        self.basedir = ""
        self.filelist = []
        self.duplicates = {}
        self.ratio = 0.85

    def reset(self):
        """
        Forget all scanned files and all found duplicates
        """
        self.filelist = []
        self.duplicates = {}

    def scandir(self, basedir, extra=None):
        """
        Scan a base directory for movie files and add them to
        the list for analyze

        basedir is walked recursively. extra is an optional list of
        additional lower case extensions (with leading dot) to collect
        besides the built in movie extensions.
        """
        extra = extra or []
        self.basedir = basedir
        print("Scanning directory: %s" % basedir)
        for root, _subdirs, files in os.walk(basedir):
            for filename in files:
                # splitext removes the extension whatever its length is,
                # a fixed slice left a trailing dot on ".mpeg" files
                stem, ext = os.path.splitext(filename)
                ext = ext.lower()
                if ext == ".ts":
                    parts = stem.split(" - ")
                    if len(parts) == 1:
                        title = parts[0]
                    else:
                        # the title starts after the first two " - " fields
                        title = " - ".join(parts[2:])
                elif ext in ('.mkv', '.avi', '.mpg', '.mpeg', '.mp4') or ext in extra:
                    title = stem
                else:
                    continue
                self.filelist.append([title.lower(), filename, root, ext])

    def fixnames(self):
        """
        Search for defect filenames and remove illegal characters

        Only .mkv and .txt entries of the file list are touched. Every
        character outside the allowed set is replaced by '-' and the file
        is renamed on disk. A file is skipped when the cleaned name already
        exists, so no other file gets overwritten.
        """
        import re
        illegal = re.compile(r'[^A-Za-z0-9\.\_\-\(\)\&öäüÖÄÜ\' ]')
        for item in self.filelist:
            if item[3] not in ('.mkv', '.txt'):
                continue
            # any non-alphanumeric characters in filename?
            cleanfn = replace_all(item[1], {
                'ß':'ss',
            })
            cleanfn = illegal.sub('-', cleanfn)
            if item[1] == cleanfn:
                continue
            target = os.path.join(item[2], cleanfn)
            if os.path.exists(target):
                # two defect names can end up with the same clean name,
                # os.rename would silently replace the other file on POSIX
                print("Skipping %s: %s already exists" % (item[1], cleanfn),
                      file=sys.stderr)
                continue
            print(item[1])
            os.rename(os.path.join(item[2], item[1]), target)
            # keep the list in sync with the disk for later method calls
            item[1] = cleanfn

    def statistics(self):
        """
        Summarize disk usage and print stats about found filetypes

        Prints one line with file count and total size per extension,
        followed by a TOTAL line. The size of every file is read with
        os.stat, so all files of the list have to exist.
        """
        stats = {}
        for item in self.filelist:
            entry = stats.setdefault(item[3], [0, 0.0])
            entry[0] += 1
            entry[1] += os.stat(os.path.join(item[2], item[1])).st_size
        print("%5s %6s %10s" % ("File:", "Count:", "Size:"))
        sum_count = 0
        sum_size = 0.0
        for ext, (count, size) in stats.items():
            sum_count += count
            sum_size += size
            print("%5s %6i %10s" % (ext, count, humansize(size)))
        print("%5s %6i %10s" % ("TOTAL", sum_count, humansize(sum_size)))

    def analyze(self):
        """
        Analyze the scanlist for duplicates

        Every title is compared with all titles following it in the list.
        When the similarity is above self.ratio the later file is stored in
        self.duplicates under the path of the earlier one.
        Note: a file found to be a duplicate is replaced by None in
        self.filelist, the list is consumed by this method.
        """
        print("%i files to analyze, running duplicate testing loop..." % (
            len(self.filelist)))

        listlen = len(self.filelist)
        for idx in range(listlen):
            current = self.filelist[idx]
            if not current:
                continue
            # progress on a single line: carriage return + erase to line end
            print("\r%d %s\033[K" % (idx, current[0]), end='')
            sys.stdout.flush()
            key = os.path.join(current[2], current[1])
            for idx2 in range(idx + 1, listlen):
                other = self.filelist[idx2]
                if not other:
                    continue
                if similarity(current[0], other[0]) > self.ratio:
                    self.duplicates.setdefault(key, []).append(
                        os.path.join(other[2], other[1]))
                    # unset the found duplicate, so that this will not be scanned again
                    self.filelist[idx2] = None
        print("\n\n")

    def output(self):
        """
        Dump found duplicates to console

        Prints one numbered set per key of self.duplicates: the base file
        first, then all of its duplicates, then an empty line.
        """
        for idx, (base, dups) in enumerate(self.duplicates.items(), 1):
            print("Duplicate file set #%i" % idx)
            print(base)
            for dup in dups:
                print(dup)
            print()
173 | ||
174 | ||
if __name__ == "__main__":
    # parse command line options
    import argparse

    parser = argparse.ArgumentParser(
        description='Movie database filename duplicate checker')
    parser.add_argument('--ratio', type=float, default=0.85,
                        help='filename duplicate threshold 0.1 < ratio < 1.0 (default 0.85)')
    parser.add_argument('--difflib', action='store_true', default=False,
                        help='force the use of difflib instead Levenshtein')
    parser.add_argument('--stats', action='store_true', default=False,
                        help='generate stats summary instead of check for duplicates')
    parser.add_argument('--fixnames', action='store_true', default=False,
                        help='scan for mkv and txt, fix broken filenames for windows')
    parser.add_argument('basedir', metavar='basedir', nargs='+',
                        help='one or more base directories')

    args = parser.parse_args()
    dupe = dupechecker()
    dupe.ratio = args.ratio

    # Select the string comparison backend. DIFFLIB and the imported module
    # become module globals, which is where similarity() looks them up.
    if args.difflib:
        DIFFLIB = True
        import difflib
    else:
        try:
            import Levenshtein
            DIFFLIB = False
        except ImportError:
            import difflib
            DIFFLIB = True
            print("Consider 'pip install python-Levenshtein' for faster analyze")

    # --fixnames is a mode of its own: rename the defect files of every
    # base directory, then quit without statistics or duplicate analysis
    if args.fixnames:
        for srcstr in args.basedir:
            dupe.scandir(srcstr, ['.txt'])
            if dupe.filelist:
                print("Checking %i file names..." % len(dupe.filelist))
                dupe.fixnames()
            dupe.filelist = []
        sys.exit(0)

    for srcstr in args.basedir:
        dupe.scandir(srcstr)

    # args.fixnames can not be set here any more, that mode exited above
    if args.stats:
        dupe.statistics()
    else:
        dupe.analyze()
        dupe.output()
33 | 225 |