Tue, 12 Dec 2017 03:25:44 +0100
added stats calc to dupechecker
3 | 1 | #!/usr/bin/env python |
21 | 2 | """ |
3 | Toolkit / executable to scan for duplicate filenames in movie database | |
4 | ||
5 | 2017 by mdd | |
6 | """ | |
7 | ||
8 | #pylint: disable=line-too-long | |
9 | #pylint: disable=invalid-name | |
3 | 10 | |
4 | 11 | from __future__ import print_function |
3 | 12 | import os, sys |
13 | ||
22
c18abd9198c0
implemented Levenshtein algorithm for incredible speedup
mdd
parents:
21
diff
changeset
|
def similarity(a, b):
    """
    Return a similarity ratio for the two strings a and b.

    The backend is chosen by the module-level flag DIFFLIB:
      * DIFFLIB true or not defined -> difflib.SequenceMatcher (stdlib)
      * DIFFLIB false               -> Levenshtein.ratio

    The flag is normally set by the command line section of this file.
    When the file is imported as a module that section never runs, so a
    missing flag falls back to difflib instead of raising NameError.
    A caller that sets DIFFLIB to False must also have imported the
    Levenshtein module at module level, as the command line section does.
    """
    try:
        use_difflib = DIFFLIB
    except NameError:
        # not started as a script: no backend selected, use the stdlib one
        use_difflib = True

    if use_difflib:
        # local import keeps this function usable without the script setup
        from difflib import SequenceMatcher
        return SequenceMatcher(a=a, b=b).ratio()
    return Levenshtein.ratio(a, b)
c18abd9198c0
implemented Levenshtein algorithm for incredible speedup
mdd
parents:
21
diff
changeset
|
19 | |
suffixes = ['b', 'K', 'M', 'G', 'T', 'P']
def humansize(nbytes):
    """
    Format a byte count as a short human readable string, e.g. '1.5 K'.

    The value is divided by 1024 until it drops below 1024 or the largest
    entry of `suffixes` is reached. It is printed with at most two
    decimals; trailing zeros and a trailing dot are stripped.
    """
    i = 0
    # Compare the value as it will be displayed (two decimals): otherwise
    # 1048575 bytes would be printed as '1024 K' instead of '1 M'.
    while round(nbytes, 2) >= 1024 and i < len(suffixes) - 1:
        nbytes /= 1024.
        i += 1
    f = ('%.2f' % nbytes).rstrip('0').rstrip('.')
    return '%s %s' % (f, suffixes[i])
28 | ||
class dupechecker(object):
    """
    Simple class to scan multiple directories recursively,
    build a list of movie filenames,
    analyze the list for duplicates and dump them.

    Attributes (all set in __init__):
      basedir    -- the directory most recently passed to scandir()
      filelist   -- list of [title, filename, directory, extension]
      duplicates -- dict: path of a file -> list of paths whose titles
                    were rated similar to it by analyze()
      ratio      -- threshold; a pair is reported when its similarity is
                    strictly greater than this value
    """

    # extensions whose title is the filename without its extension
    MOVIE_EXTENSIONS = ('.mkv', '.avi', '.mpg', '.mpeg', '.mp4')

    def __init__(self):
        self.basedir = ""
        self.filelist = []
        self.duplicates = {}
        self.ratio = 0.85

    def reset(self):
        """
        Forget all scanned files and all found duplicates
        """
        self.filelist = []
        self.duplicates = {}

    def scandir(self, basedir):
        """
        Scan a base directory for movie files and add them to
        the list for analyze.

        For '.ts' files the name is split at " - "; when there are more
        than two fields the first two are dropped and the rest is the
        title. For the other movie extensions the title is the filename
        without its extension. Titles are stored lowercased.
        """
        self.basedir = basedir
        print("Scanning directory: %s" % basedir)
        for root, _subdirs, files in os.walk(basedir):
            for filename in files:
                # splitext removes the real extension, whatever its length
                # (a fixed [:-4] left a trailing dot on '.mpeg' titles)
                stem, ext = os.path.splitext(filename)
                ext = ext.lower()
                if ext == ".ts":
                    parts = stem.split(" - ")
                    # Only drop the two leading fields when a title remains
                    # behind them; with exactly two fields the old code
                    # produced an empty title, which made all such files
                    # look like duplicates of each other.
                    if len(parts) > 2:
                        stem = " - ".join(parts[2:])
                elif ext not in self.MOVIE_EXTENSIONS:
                    continue
                self.filelist.append([stem.lower(), filename, root, ext])

    def statistics(self):
        """
        Summarize disk usage and print stats about found filetypes
        """
        stats = {}
        for _title, filename, root, ext in self.filelist:
            entry = stats.setdefault(ext, [0, 0.0])
            entry[0] += 1
            path = os.path.join(root, filename)
            try:
                entry[1] += os.stat(path).st_size
            except OSError as err:
                # e.g. a broken symlink or a file removed since the scan:
                # keep it in the count, report it, add no size
                print("Cannot stat %s: %s" % (path, err), file=sys.stderr)
        print ("%5s %6s %10s" % (
            "File:",
            "Count:",
            "Size:"))
        for ext in stats:
            print ("%5s %6i %10s" % (
                ext, stats[ext][0],
                humansize(stats[ext][1])))

    def analyze(self):
        """
        Analyze the scanlist for duplicates.

        Every file is compared with all files after it in the list. A
        file that was rated similar to an earlier one is recorded in
        self.duplicates under the earlier file's path and is not used as
        a comparison base or candidate again. self.filelist itself is
        left unchanged, so statistics() still works afterwards.
        """
        listlen = len(self.filelist)
        print("%i files to analyze, running duplicate testing loop..." % (
            listlen))

        # indexes already assigned to a duplicate set
        matched = set()
        for idx in range(listlen):
            if idx in matched:
                continue
            title, filename, root = self.filelist[idx][:3]
            # progress line: carriage return + "erase to end of line"
            print("\r%d %s\033[K" % (idx, title), end='')
            sys.stdout.flush()
            key = os.path.join(root, filename)
            for idx2 in range(idx + 1, listlen):
                if idx2 in matched:
                    continue
                other = self.filelist[idx2]
                if similarity(title, other[0]) > self.ratio:
                    self.duplicates.setdefault(key, []).append(
                        os.path.join(other[2], other[1]))
                    # do not scan the found duplicate again
                    matched.add(idx2)
        print("\n\n")

    def output(self):
        """
        Dump found duplicates to console
        """
        for idx, base in enumerate(self.duplicates, 1):
            print("Duplicate file set #%i" % idx)
            print(base)
            for dup in self.duplicates[base]:
                print(dup)
            print()
133 | ||
134 | ||
if __name__ == "__main__":
    # Command line entry point: scan the given directories, then either
    # print per-filetype statistics or search for duplicate titles.
    import argparse

    parser = argparse.ArgumentParser(
        description='Movie database filename duplicate checker')
    parser.add_argument(
        '--ratio', type=float, default=0.85,
        help='filename duplicate threshold 0.1 < ratio < 1.0 (default 0.85)')
    parser.add_argument(
        '--difflib', action='store_true', default=False,
        help='force the use of difflib instead of Levenshtein')
    parser.add_argument(
        '--stats', action='store_true', default=False,
        help='generate stats summary instead of check for duplicates')
    parser.add_argument(
        'basedir', metavar='basedir', nargs='+',
        help='one or more base directories')

    args = parser.parse_args()
    dupe = dupechecker()
    dupe.ratio = args.ratio

    # similarity() reads the module-level names DIFFLIB, difflib and
    # Levenshtein, so the backend is selected and imported here at module
    # scope. Levenshtein is preferred unless --difflib is given or the
    # package is not installed.
    DIFFLIB = args.difflib
    if not DIFFLIB:
        try:
            import Levenshtein
        except ImportError:
            DIFFLIB = True
            print("Consider 'pip install python-Levenshtein' for faster analyze")
    if DIFFLIB:
        import difflib

    for srcstr in args.basedir:
        dupe.scandir(srcstr)
    if args.stats:
        dupe.statistics()
    else:
        dupe.analyze()
        dupe.output()