dupecheck.py

Tue, 12 Dec 2017 03:25:44 +0100

author
mdd
date
Tue, 12 Dec 2017 03:25:44 +0100
changeset 32
df89a8fba2a2
parent 22
c18abd9198c0
child 33
83bcb5931ee3
permissions
-rwxr-xr-x

added stats calc to dupechecker

3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
1 #!/usr/bin/env python
21
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
2 """
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
3 Toolkit / executable to scan for duplicate filenames in movie database
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
4
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
5 2017 by mdd
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
6 """
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
7
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
8 #pylint: disable=line-too-long
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
9 #pylint: disable=invalid-name
3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
10
4
a7e9e7974c22 prepare for speedup
mdd
parents: 3
diff changeset
11 from __future__ import print_function
3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
12 import os, sys
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
13
22
c18abd9198c0 implemented Levenshtein algorithm for incredible speedup
mdd
parents: 21
diff changeset
def similarity(a, b):
    """
    Return a similarity score for the two strings a and b.

    The score comes from difflib.SequenceMatcher(...).ratio() when the
    module level flag DIFFLIB is true, otherwise from Levenshtein.ratio().

    The command line entry point sets DIFFLIB (and imports the matching
    module) itself.  When this file is used as an imported toolkit the flag
    does not exist yet; it is then resolved here on the first call:
    Levenshtein is used when it can be imported, difflib otherwise.
    """
    # The names are module globals on purpose: the entry point assigns them
    # at module level and the backend has to be resolved only once.
    global DIFFLIB, difflib, Levenshtein  #pylint: disable=global-statement
    try:
        use_difflib = DIFFLIB
    except NameError:
        # First call without a configured backend: pick one and cache the
        # decision so the (possibly failing) import is not repeated for
        # every comparison.
        try:
            import Levenshtein
            DIFFLIB = False
        except ImportError:
            import difflib
            DIFFLIB = True
        use_difflib = DIFFLIB

    if use_difflib:
        return difflib.SequenceMatcher(a=a, b=b).ratio()
    return Levenshtein.ratio(a, b)
c18abd9198c0 implemented Levenshtein algorithm for incredible speedup
mdd
parents: 21
diff changeset
19
32
df89a8fba2a2 added stats calc to dupechecker
mdd
parents: 22
diff changeset
# Unit suffixes used by humansize(), smallest first; each step is a factor 1024.
suffixes = ['b', 'K', 'M', 'G', 'T', 'P']


def humansize(nbytes):
    """
    Format a byte count as a short human readable string, e.g. '1.5 K'.

    The value is divided by 1024 until it is below 1024 or the largest
    suffix is reached, printed with at most two decimals (trailing zeros
    and a trailing dot are removed) and followed by the unit suffix.

    nbytes -- number of bytes (int or float)
    returns -- string '<number> <suffix>'
    """
    value = float(nbytes)
    i = 0
    last = len(suffixes) - 1
    # Decide on the value as it will be *printed* (two decimals): otherwise
    # e.g. 1048575 bytes would be shown as '1024 K' instead of '1 M'.
    # abs() makes negative values scale by magnitude as well.
    while i < last and float('%.2f' % abs(value)) >= 1024:
        value /= 1024.
        i += 1
    f = ('%.2f' % value).rstrip('0').rstrip('.')
    return '%s %s' % (f, suffixes[i])
df89a8fba2a2 added stats calc to dupechecker
mdd
parents: 22
diff changeset
28
21
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
class dupechecker(object):
    """
    Simple class to scan multiple directories recursively,
    build a list of movie filenames,
    analyze the list for duplicates and dump them.

    Attributes:
        basedir    -- directory most recently passed to scandir()
        filelist   -- one [title, filename, root, ext] entry per movie file
        duplicates -- maps the full path of a file to the list of full
                      paths of the files whose title is similar to it
        ratio      -- threshold; two titles are duplicates when their
                      similarity() is strictly greater than this value
    """

    # Lower case extensions of the movie files collected besides '.ts'.
    MOVIE_EXTENSIONS = ('.mkv', '.avi', '.mpg', '.mpeg', '.mp4')

    def __init__(self):
        self.basedir = ""
        self.filelist = []
        self.duplicates = {}
        self.ratio = 0.85

    def reset(self):
        """
        Forget all scanned files and all duplicates found so far
        """
        self.filelist = []
        self.duplicates = {}

    def scandir(self, basedir):
        """
        Scan a base directory recursively for movie files and add them to
        the list used by analyze() and statistics().

        The title of a file is its lower cased name without extension.
        For '.ts' files the name is split at " - " and the first two
        fields are dropped (presumably date and channel of a recording --
        TODO confirm the naming scheme).
        """
        self.basedir = basedir
        print("Scanning directory: %s" % basedir)
        for root, _subdirs, files in os.walk(basedir):
            for filename in files:
                # splitext instead of fixed width slicing: '.mpeg' has five
                # characters, slicing four left a trailing '.' in the title
                stem, ext = os.path.splitext(filename)
                ext = ext.lower()
                if ext == ".ts":
                    parts = stem.split(" - ")
                    if len(parts) > 2:
                        title = " - ".join(parts[2:])
                    else:
                        # One or two fields only: keep the last field.
                        # Dropping two fields here produced an empty title,
                        # and all empty titles matched each other.
                        title = parts[-1]
                elif ext in self.MOVIE_EXTENSIONS:
                    title = stem
                else:
                    continue
                self.filelist.append([title.lower(), filename, root, ext])

    def statistics(self):
        """
        Summarize disk usage and print stats about found filetypes
        """
        stats = {}
        for item in self.filelist:
            if not item:
                # tolerate holes in the list (entries set to None)
                continue
            path = os.path.join(item[2], item[1])
            try:
                size = os.stat(path).st_size
            except OSError as err:
                # e.g. a broken symlink or a file removed since the scan:
                # report it and count the file without a size
                sys.stderr.write("cannot stat %s: %s\n" % (path, err))
                size = 0
            entry = stats.setdefault(item[3], [0, 0.0])
            entry[0] += 1
            entry[1] += size
        print("%5s %6s %10s" % (
            "File:",
            "Count:",
            "Size:"))
        # sorted: stable output order regardless of scan order
        for ext in sorted(stats):
            print("%5s %6i %10s" % (
                ext, stats[ext][0],
                humansize(stats[ext][1])))

    def analyze(self):
        """
        Analyze the scanlist for duplicates.

        Every file is compared with all files after it in the list; a file
        whose title is similar to an earlier one is recorded in
        self.duplicates under the earlier file's path and is not compared
        again.  self.filelist itself is left untouched, so statistics()
        and repeated analyze() calls keep working afterwards.
        """
        entries = self.filelist
        listlen = len(entries)
        print("%i files to analyze, running duplicate testing loop..." % (
            listlen))

        ratio = self.ratio
        # indexes already assigned to a duplicate set
        consumed = set()
        for idx in range(listlen):
            head = entries[idx]
            if idx in consumed or not head:
                continue
            # progress line, overwritten in place on the terminal
            print("\r%d %s\033[K" % (
                idx, head[0]), end='')
            sys.stdout.flush()
            head_title = head[0]
            key = os.path.join(head[2], head[1])
            for idx2 in range(idx + 1, listlen):
                other = entries[idx2]
                if idx2 in consumed or not other:
                    continue
                if similarity(head_title, other[0]) > ratio:
                    group = self.duplicates.setdefault(key, [])
                    path = os.path.join(other[2], other[1])
                    # a repeated analyze() must not list a file twice
                    if path not in group:
                        group.append(path)
                    # mark the found duplicate, so that it will not be
                    # scanned again
                    consumed.add(idx2)
        print("\n\n")

    def output(self):
        """
        Dump found duplicates to console
        """
        for idx, (base, dups) in enumerate(self.duplicates.items(), 1):
            print("Duplicate file set #%i" % idx)
            print(base)
            for dup in dups:
                print(dup)
            print()
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
133
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
134
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
if __name__ == "__main__":
    # Command line entry point: scan the given directories, then either
    # print file type statistics or search for duplicate titles.
    import argparse

    parser = argparse.ArgumentParser(
        description='Movie database filename duplicate checker')
    parser.add_argument('--ratio', type=float, default=0.85,
                        help='filename duplicate threshold 0.1 < ratio < 1.0 (default 0.85)')
    parser.add_argument('--difflib', action='store_true', default=False,
                        help='force the use of difflib instead of Levenshtein')
    parser.add_argument('--stats', action='store_true', default=False,
                        help='generate stats summary instead of check for duplicates')
    parser.add_argument('basedir', metavar='basedir', nargs='+',
                        help='one or more base directories')

    args = parser.parse_args()
    # A similarity score lies between 0.0 and 1.0, so a threshold outside
    # that range (or NaN) can only be a typo; refuse it instead of silently
    # reporting everything or nothing.
    if not 0.0 <= args.ratio <= 1.0:
        parser.error("--ratio must be between 0.0 and 1.0")

    dupe = dupechecker()
    dupe.ratio = args.ratio

    # Select the string comparison backend used by similarity(); DIFFLIB and
    # the imported module are module level names that similarity() reads.
    if args.difflib:
        DIFFLIB = True
        import difflib
    else:
        try:
            import Levenshtein
            DIFFLIB = False
        except ImportError:
            import difflib
            DIFFLIB = True
            print("Consider 'pip install python-Levenshtein' for faster analyze")

    for srcstr in args.basedir:
        dupe.scandir(srcstr)
    if args.stats:
        dupe.statistics()
    else:
        dupe.analyze()
        dupe.output()

mercurial