dupecheck.py

Wed, 29 Nov 2017 23:04:52 +0100

author
mdd
date
Wed, 29 Nov 2017 23:04:52 +0100
changeset 21
1c0beeca2f9c
parent 15
82361ad7b3fe
child 22
c18abd9198c0
permissions
-rwxr-xr-x

cleanup dupechecker

3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
1 #!/usr/bin/env python
21
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
2 """
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
3 Toolkit / executable to scan for duplicate filenames in movie database
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
4
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
5 2017 by mdd
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
6 """
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
7
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
8 #pylint: disable=line-too-long
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
9 #pylint: disable=invalid-name
3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
10
4
a7e9e7974c22 prepare for speedup
mdd
parents: 3
diff changeset
11 from __future__ import print_function
3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
12 import difflib
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
13 import os, sys
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
14
21
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
class dupechecker(object):
    """
    Scan one or more directory trees for movie files, derive a
    normalized (lower-case) title from every filename and report
    files whose titles are similar to each other.

    Attributes:
        basedir:    the directory passed to the most recent scandir() call
        filelist:   list of [title, filename, root] entries collected by
                    scandir(); analyze() replaces entries it has assigned
                    to a duplicate set with None
        duplicates: dict mapping the full path of the first file of a set
                    to the list of full paths considered its duplicates
        ratio:      similarity threshold; two titles match when their
                    difflib.SequenceMatcher ratio is strictly greater
    """

    # Extension of recordings whose names may carry leading fields
    # separated by " - " in front of the actual title.
    RECORDING_EXTENSION = '.ts'
    RECORDING_SEPARATOR = ' - '
    # Plain movie files; the title is the filename without extension.
    MOVIE_EXTENSIONS = ('.mkv', '.avi', '.mpg', '.mpeg', '.mp4')

    def __init__(self):
        self.basedir = ""
        self.filelist = []
        self.duplicates = {}
        self.ratio = 0.85

    def reset(self):
        """
        Drop all scanned files and all found duplicates.
        basedir and ratio are left untouched.
        """
        self.filelist = []
        self.duplicates = {}

    def scandir(self, basedir):
        """
        Scan a base directory recursively for movie files and add them
        to the list used by analyze().

        May be called several times; every call appends to filelist.
        For ".ts" files with at least three " - " separated fields the
        first two fields are dropped from the title (presumably date and
        channel of a recording - verify against the real naming scheme).
        Files with an unknown extension are ignored.
        """
        self.basedir = basedir
        print("Scanning directory: %s" % basedir)
        for root, _subdirs, files in os.walk(basedir):
            for filename in files:
                # splitext copes with extensions of any length; slicing
                # off a fixed 4 characters left "name." for "name.mpeg"
                stem, ext = os.path.splitext(filename)
                ext = ext.lower()
                if ext == self.RECORDING_EXTENSION:
                    fields = stem.split(self.RECORDING_SEPARATOR)
                    # With fewer than three fields there is nothing left
                    # after dropping two, so keep the whole name. An empty
                    # title would compare as identical to every other
                    # empty title and produce bogus duplicates.
                    title = self.RECORDING_SEPARATOR.join(fields[2:]) or stem
                elif ext in self.MOVIE_EXTENSIONS:
                    title = stem
                else:
                    continue
                self.filelist.append([title.lower(), filename, root])

    def analyze(self):
        """
        Analyze the scan list for duplicates.

        Every remaining entry is compared with all entries behind it.
        Matches are stored in duplicates under the path of the earlier
        file. A progress line is written to stdout while running.
        """
        total = len(self.filelist)
        print("%i files to analyze, running duplicate testing loop..." % total)

        for idx, entry in enumerate(self.filelist):
            if not entry:
                # already assigned to an earlier duplicate set
                continue
            title, filename, root = entry
            # "\r" + erase-to-end-of-line keeps the progress on one line
            print("\r%d %s\033[K" % (idx, title), end='')
            sys.stdout.flush()
            key = os.path.join(root, filename)
            for idx2 in range(idx + 1, total):
                other = self.filelist[idx2]
                if not other:
                    continue
                matcher = difflib.SequenceMatcher(a=title, b=other[0])
                # real_quick_ratio() and quick_ratio() are cheap upper
                # bounds of ratio(); testing them first skips the costly
                # comparison for clearly different titles without
                # changing the outcome.
                if (matcher.real_quick_ratio() > self.ratio
                        and matcher.quick_ratio() > self.ratio
                        and matcher.ratio() > self.ratio):
                    self.duplicates.setdefault(key, []).append(
                        os.path.join(other[2], other[1]))
                    # unset the found duplicate, so that it will not be
                    # scanned again
                    self.filelist[idx2] = None
        print("\n\n\n")

    def output(self):
        """
        Dump found duplicates to console, one numbered set per base file.
        """
        for idx, base in enumerate(self.duplicates, 1):
            print("Duplicate file set #%i" % idx)
            print(base)
            for dup in self.duplicates[base]:
                print(dup)
            print()
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
96
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
97
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
if __name__ == "__main__":
    # Command line entry point: scan every given base directory, then
    # analyze the combined file list once and print the duplicate sets.
    import argparse

    parser = argparse.ArgumentParser(
        description='Movie database filename duplicate checker')
    parser.add_argument('--ratio', type=float, default=0.85,
                        help='filename duplicate threshold 0.1 < ratio < 1.0')
    parser.add_argument('basedir', metavar='basedir', nargs='+',
                        help='one or more base directories')

    args = parser.parse_args()
    # Titles match only when their similarity is strictly greater than
    # the threshold, so 1.0 or more can never match; reject values
    # outside the documented range instead of running a useless scan.
    if not 0.1 < args.ratio < 1.0:
        parser.error('--ratio must satisfy 0.1 < ratio < 1.0')

    dupe = dupechecker()
    dupe.ratio = args.ratio

    for srcstr in args.basedir:
        if not os.path.isdir(srcstr):
            # os.walk silently yields nothing for a missing directory
            print("Skipping %s: not a directory" % srcstr, file=sys.stderr)
            continue
        dupe.scandir(srcstr)
    dupe.analyze()
    dupe.output()

mercurial