Wed, 29 Nov 2017 23:04:52 +0100
cleanup dupechecker
3 | 1 | #!/usr/bin/env python |
21 | 2 | """ |
3 | Toolkit / executable to scan for duplicate filenames in movie database | |
4 | ||
5 | 2017 by mdd | |
6 | """ | |
7 | ||
8 | #pylint: disable=line-too-long | |
9 | #pylint: disable=invalid-name | |
3 | 10 | |
4 | 11 | from __future__ import print_function |
3 | 12 | import difflib |
13 | import os, sys | |
14 | ||
class dupechecker(object):
    """
    Scan directories recursively for movie files and report files whose
    titles look alike.

    Typical use: call scandir() once per base directory, then analyze(),
    then output().

    Attributes:
        basedir: base directory passed to the most recent scandir() call.
        filelist: one ``[title, filename, directory]`` entry per movie file
            found; analyze() replaces entries it reported as duplicates
            with None.
        duplicates: maps the full path of a file to the list of full paths
            of files with a similar title.
        ratio: similarity threshold; two titles count as duplicates when
            difflib.SequenceMatcher.ratio() is greater than this value.
    """

    # Lower-case extensions of the regular movie files picked up by
    # scandir(); ".ts" files get their own title handling.
    MOVIE_EXTENSIONS = ('.mkv', '.avi', '.mpg', '.mpeg', '.mp4')

    def __init__(self):
        self.basedir = ""
        self.filelist = []
        self.duplicates = {}
        self.ratio = 0.85

    def reset(self):
        """
        Forget all scanned files and all duplicates found so far
        """
        self.filelist = []
        self.duplicates = {}

    def _title_from_filename(self, filename):
        """
        Return the lower-cased title used for comparison, or None when
        the file extension is not a known movie extension
        """
        # splitext removes the extension whatever its length, so ".mpeg"
        # does not leave a trailing dot behind.
        stem, ext = os.path.splitext(filename)
        ext = ext.lower()
        if ext == ".ts":
            fields = stem.split(" - ")
            # Names with three or more " - " separated fields lose the
            # first two (presumably recording metadata such as timestamp
            # and channel -- confirm against the recorder's naming scheme).
            # Shorter names are kept whole; slicing them would produce an
            # empty title that matches every other empty title.
            if len(fields) >= 3:
                stem = " - ".join(fields[2:])
            return stem.lower()
        if ext in self.MOVIE_EXTENSIONS:
            return stem.lower()
        return None

    def _is_similar(self, title_a, title_b):
        """
        Return True when the similarity of both titles exceeds self.ratio
        """
        # Argument order is kept as (a, b): ratio() may depend on it.
        matcher = difflib.SequenceMatcher(None, title_a, title_b)
        # real_quick_ratio() and quick_ratio() are cheap upper bounds of
        # ratio(), so they can only reject pairs ratio() would reject too.
        return (matcher.real_quick_ratio() > self.ratio
                and matcher.quick_ratio() > self.ratio
                and matcher.ratio() > self.ratio)

    def scandir(self, basedir):
        """
        Scan a base directory for movie files and add them to
        the list for analyze

        Files are appended to self.filelist; entries from earlier calls
        are kept, so several directories can be scanned before analyze().
        """
        self.basedir = basedir
        print("Scanning directory: %s" % basedir)
        for root, _, files in os.walk(basedir):
            for filename in files:
                title = self._title_from_filename(filename)
                if title is not None:
                    self.filelist.append([title, filename, root])

    def analyze(self):
        """
        Analyze the scanlist for duplicates

        Every file is compared with all files after it in self.filelist.
        Matches are collected in self.duplicates under the full path of
        the earlier file, and the matched entry in self.filelist is set
        to None so it is not compared or reported again.
        """
        print("%i files to analyze, running duplicate testing loop..." % (
            len(self.filelist)))

        listlen = len(self.filelist)
        for idx in range(listlen):
            entry = self.filelist[idx]
            if not entry:
                # already reported as a duplicate of an earlier file
                continue
            # progress line: "\r" rewinds, "\033[K" clears to end of line
            print("\r%d %s\033[K" % (idx, entry[0]), end='')
            sys.stdout.flush()
            key = os.path.join(entry[2], entry[1])
            for idx2 in range(idx + 1, listlen):
                other = self.filelist[idx2]
                if not other:
                    continue
                if self._is_similar(entry[0], other[0]):
                    self.duplicates.setdefault(key, []).append(
                        os.path.join(other[2], other[1]))
                    # unset the found duplicate, so that this will not be scanned again
                    self.filelist[idx2] = None
        print("\n\n\n")

    def output(self):
        """
        Dump found duplicates to console

        Each set is printed as a numbered header, the path of the first
        file, the paths of its duplicates and a blank line.
        """
        for idx, (base, dups) in enumerate(self.duplicates.items(), 1):
            print("Duplicate file set #%i" % idx)
            print(base)
            for dup in dups:
                print(dup)
            print()
96 | ||
97 | ||
if __name__ == "__main__":
    # Command line entry point: scan every given base directory, then
    # search the combined file list for duplicates and print them.
    import argparse

    parser = argparse.ArgumentParser(
        description='Movie database filename duplicate checker')
    parser.add_argument('--ratio', type=float, default=0.85,
                        help='filename duplicate threshold, '
                             '0.0 <= ratio <= 1.0 (default: 0.85)')
    parser.add_argument('basedir', metavar='basedir', nargs='+',
                        help='one or more base directories')

    args = parser.parse_args()
    # SequenceMatcher.ratio() only yields values in [0, 1]; a threshold
    # outside that range (or NaN) cannot give a meaningful result.
    if not 0.0 <= args.ratio <= 1.0:
        parser.error('--ratio must be between 0.0 and 1.0')

    dupe = dupechecker()
    dupe.ratio = args.ratio

    # Collect the files of all base directories first, so that duplicates
    # spread over different directories are found as well.
    for srcstr in args.basedir:
        dupe.scandir(srcstr)
    dupe.analyze()
    dupe.output()
115 | dupe.analyze() | |
116 | dupe.output() |