Tue, 12 Dec 2017 01:26:51 +0100
Added tag V1.1 for changeset a2951f7c435e
#!/usr/bin/env python
"""
Toolkit / executable to scan for duplicate filenames in movie database

2017 by mdd
"""

#pylint: disable=line-too-long
#pylint: disable=invalid-name
3 | 10 | |
4 | 11 | from __future__ import print_function |
3 | 12 | import os, sys |
13 | ||
22
c18abd9198c0
implemented Levenshtein algorithm for incredible speedup
mdd
parents:
21
diff
changeset
|
def similarity(a, b):
    """
    Return a similarity score for the two strings a and b.

    The backend is chosen through the module-level flag DIFFLIB, which the
    command line entry point of this file sets: a true value selects
    difflib.SequenceMatcher, a false value selects Levenshtein.ratio.

    When DIFFLIB has not been defined at all (the module was imported as a
    toolkit instead of being run as a script), difflib is used so that the
    function works without any prior setup instead of raising NameError.

    Returns the ratio reported by the selected backend; for difflib this is
    a float in [0, 1], where 1.0 means the sequences are identical.
    """
    try:
        prefer_difflib = DIFFLIB
    except NameError:
        # No backend was selected by the caller; difflib is always available.
        prefer_difflib = True

    if not prefer_difflib:
        # DIFFLIB is only set to a false value after Levenshtein was imported.
        return Levenshtein.ratio(a, b)

    # Local import: the module-level import of difflib only happens inside
    # the script entry point, so it cannot be relied upon here.
    import difflib
    return difflib.SequenceMatcher(a=a, b=b).ratio()
c18abd9198c0
implemented Levenshtein algorithm for incredible speedup
mdd
parents:
21
diff
changeset
|
19 | |
class dupechecker(object):
    """
    Scan one or more directory trees for movie files, build a list of
    normalized titles, and group files whose titles look alike.

    Attributes:
        basedir: directory passed to the most recent scandir() call.
        filelist: list of [title, filename, directory] entries collected by
            scandir(); analyze() overwrites entries it has assigned to a
            duplicate group with None.
        duplicates: maps the full path of the first file of a group to the
            list of full paths that were judged to be duplicates of it.
        ratio: similarity threshold; two titles count as duplicates when
            their similarity score is strictly greater than this value.
    """

    # Extensions (besides ".ts") that are treated as movie files.
    MOVIE_EXTENSIONS = ('.mkv', '.avi', '.mpg', '.mpeg', '.mp4')

    def __init__(self):
        self.basedir = ""
        self.filelist = []
        self.duplicates = {}
        self.ratio = 0.85

    def reset(self):
        """
        Forget all scanned files and all duplicate groups found so far
        """
        self.filelist = []
        self.duplicates = {}

    def scandir(self, basedir):
        """
        Scan a base directory recursively for movie files and add them to
        the list for analyze

        For every matching file a [title, filename, directory] entry is
        appended to self.filelist. The title is the lower-cased filename
        without its extension. For ".ts" files whose name has three or more
        " - " separated segments, the first two segments are dropped
        (presumably recording metadata such as date and channel -- TODO
        confirm against the recorder's naming scheme).
        """
        self.basedir = basedir
        print("Scanning directory: %s" % basedir)
        for root, _subdirs, files in os.walk(basedir):
            for filename in files:
                # splitext() strips the real extension regardless of its
                # length; slicing off a fixed 4 characters left a trailing
                # "." on ".mpeg" titles.
                stem, ext = os.path.splitext(filename)
                ext = ext.lower()
                if ext == ".ts":
                    parts = stem.split(" - ")
                    if len(parts) > 2:
                        title = " - ".join(parts[2:])
                    else:
                        # With only one or two segments there is nothing
                        # left after dropping two; keep the whole name so
                        # the title does not collapse to "" (empty titles
                        # all compare as identical).
                        title = stem
                elif ext in self.MOVIE_EXTENSIONS:
                    title = stem
                else:
                    continue
                self.filelist.append([title.lower(), filename, root])

    def analyze(self):
        """
        Analyze the scanlist for duplicates

        Every remaining entry is compared with all entries after it. Matches
        are recorded in self.duplicates under the full path of the earlier
        file, and the matched entry is set to None in self.filelist.
        Progress is written to stdout while the loop runs.
        """
        entries = self.filelist
        print("%i files to analyze, running duplicate testing loop..." % (
            len(entries)))

        for idx, entry in enumerate(entries):
            if not entry:
                # already assigned to an earlier duplicate group
                continue
            # "\r" and "\033[K" rewrite the same console line on each step
            print("\r%d %s\033[K" % (idx, entry[0]), end='')
            sys.stdout.flush()
            key = os.path.join(entry[2], entry[1])
            for idx2 in range(idx + 1, len(entries)):
                candidate = entries[idx2]
                if not candidate:
                    continue
                if similarity(entry[0], candidate[0]) > self.ratio:
                    self.duplicates.setdefault(key, []).append(
                        os.path.join(candidate[2], candidate[1]))
                    # unset the found duplicate, so that this will not be
                    # scanned again
                    entries[idx2] = None
        print("\n\n")

    def output(self):
        """
        Dump found duplicates to console

        Each group is printed as a numbered header, the path of the base
        file, the paths of its duplicates, and a blank line.
        """
        for idx, (base, dups) in enumerate(self.duplicates.items(), 1):
            print("Duplicate file set #%i" % idx)
            print(base)
            for dup in dups:
                print(dup)
            print()
102 | ||
103 | ||
if __name__ == "__main__":
    # Script entry point: read the command line, pick the similarity
    # backend, then scan every given directory and report the duplicates.
    import argparse

    cli = argparse.ArgumentParser(
        description='Movie database filename duplicate checker')
    cli.add_argument(
        '--ratio', type=float, default=0.85,
        help='filename duplicate threshold 0.1 < ratio 1.0 (default 0.85)')
    cli.add_argument(
        '--difflib', action='store_true', default=False,
        help='force the use of difflib instead Levenshtein')
    cli.add_argument(
        'basedir', metavar='basedir', nargs='+',
        help='one or more base directories')
    options = cli.parse_args()

    checker = dupechecker()
    checker.ratio = options.ratio

    # DIFFLIB and the imported backend module are module-level names that
    # similarity() looks up when it is called.
    DIFFLIB = options.difflib
    if not DIFFLIB:
        try:
            import Levenshtein
        except ImportError:
            # optional dependency is missing: fall back to the stdlib
            DIFFLIB = True
            print("Consider 'pip install python-Levenshtein' for faster analyze")
    if DIFFLIB:
        import difflib

    for srcstr in options.basedir:
        checker.scandir(srcstr)
    checker.analyze()
    checker.output()