def similarity(a, b):
    """
    Return a similarity ratio for the two sequences a and b.

    The module-level DIFFLIB flag selects the backend: when it is
    truthy the ratio comes from difflib.SequenceMatcher, otherwise
    from Levenshtein.ratio.
    """
    if not DIFFLIB:
        return Levenshtein.ratio(a, b)
    matcher = difflib.SequenceMatcher(a=a, b=b)
    return matcher.ratio()
|
19 |
|
# Unit labels in ascending order; each step is a factor of 1024.
suffixes = ['b', 'K', 'M', 'G', 'T', 'P']


def humansize(nbytes):
    """
    Format a byte count as a short human readable string.

    The value is divided by 1024 until it is below 1024 or the largest
    suffix is reached, then printed with at most two decimals (trailing
    zeros and a dangling decimal point are stripped), followed by a
    space and the unit suffix, e.g. '1.5 K'.

    nbytes -- size in bytes (int or float)
    Returns the formatted string.
    """
    i = 0
    last = len(suffixes) - 1
    # Decide on the unit using the value as it will be *displayed*
    # (rounded to two decimals); otherwise a size just below a boundary,
    # e.g. 1048575 bytes, would be printed as '1024 K' instead of '1 M'.
    while i < last and float('%.2f' % nbytes) >= 1024:
        nbytes /= 1024.
        i += 1
    # '%.2f' always contains a '.', so stripping zeros first and the
    # dot second never eats significant integer digits.
    f = ('%.2f' % nbytes).rstrip('0').rstrip('.')
    return '%s %s' % (f, suffixes[i])
19 |
28 |
20 class dupechecker(object): |
29 class dupechecker(object): |
21 """ |
30 """ |
22 Simple class to scan multiple directories recursive, |
31 Simple class to scan multiple directories recursive, |
23 build a list of movie filenames. |
32 build a list of movie filenames. |
50 if len(title) == 1: |
59 if len(title) == 1: |
51 title = title[0] |
60 title = title[0] |
52 else: |
61 else: |
53 title = " - ".join(title[2:]) |
62 title = " - ".join(title[2:]) |
54 title = title[:-3].lower() |
63 title = title[:-3].lower() |
55 self.filelist.append([title, filename, root]) |
64 self.filelist.append([title, filename, root, ext]) |
56 elif ext in ['.mkv', '.avi', '.mpg', '.mpeg', '.mp4']: |
65 elif ext in ['.mkv', '.avi', '.mpg', '.mpeg', '.mp4']: |
57 title = filename[:-4].lower() |
66 title = filename[:-4].lower() |
58 self.filelist.append([title, filename, root]) |
67 self.filelist.append([title, filename, root, ext]) |
|
68 |
|
def statistics(self):
    """
    Print a per-extension summary of the scanned files.

    Walks self.filelist (entries of the form
    [title, filename, root, ext]), counts the files for every
    extension and sums their on-disk size as reported by os.stat.
    A header line is printed first, then one line per extension with
    the count and the size formatted by humansize().
    """
    totals = {}
    for entry in self.filelist:
        extension = entry[3]
        path = os.path.join(entry[2], entry[1])
        # One [count, size] accumulator per extension.
        bucket = totals.setdefault(extension, [0, 0.0])
        bucket[0] += 1
        bucket[1] += os.stat(path).st_size
    header = "%5s %6s %10s" % ("File:", "Count:", "Size:")
    print (header)
    row = "%5s %6i %10s"
    for extension in totals:
        count, size = totals[extension]
        print (row % (extension, count, humansize(size)))
|
89 |
59 |
90 |
60 def analyze(self): |
91 def analyze(self): |
61 """ |
92 """ |
62 Analyze the scanlist for duplicates |
93 Analyze the scanlist for duplicates |
63 """ |
94 """ |
109 description='Movie database filename duplicate checker') |
140 description='Movie database filename duplicate checker') |
110 parser.add_argument('--ratio', type=float, default=0.85, \ |
141 parser.add_argument('--ratio', type=float, default=0.85, \ |
111 help='filename duplicate threshold 0.1 < ratio 1.0 (default 0.85)') |
142 help='filename duplicate threshold 0.1 < ratio 1.0 (default 0.85)') |
112 parser.add_argument('--difflib', action='store_true', default=False, \ |
143 parser.add_argument('--difflib', action='store_true', default=False, \ |
113 help='force the use of difflib instead Levenshtein') |
144 help='force the use of difflib instead Levenshtein') |
|
145 parser.add_argument('--stats', action='store_true', default=False, \ |
|
146 help='generate stats summary instead of check for duplicates') |
114 parser.add_argument('basedir', metavar='basedir', nargs='+', \ |
147 parser.add_argument('basedir', metavar='basedir', nargs='+', \ |
115 help='one or more base directories') |
148 help='one or more base directories') |
116 |
149 |
117 args = parser.parse_args() |
150 args = parser.parse_args() |
118 dupe = dupechecker() |
151 dupe = dupechecker() |