7 |
7 |
8 #pylint: disable=line-too-long |
8 #pylint: disable=line-too-long |
9 #pylint: disable=invalid-name |
9 #pylint: disable=invalid-name |
10 |
10 |
11 from __future__ import print_function |
11 from __future__ import print_function |
12 import difflib |
|
13 import os, sys |
12 import os, sys |
|
13 |
|
def similarity(a, b):
    """
    Return the similarity ratio of the two strings a and b.

    The ratio is a float between 0.0 (nothing in common) and 1.0
    (identical). Which backend computes it depends on the module-level
    DIFFLIB flag:

    * DIFFLIB is true  -> difflib.SequenceMatcher(...).ratio()
    * DIFFLIB is false -> Levenshtein.ratio(...)
    * DIFFLIB not set  -> auto-detect: use Levenshtein if it can be
      imported, otherwise fall back to difflib

    The auto-detect branch makes the function usable when this file is
    imported as a module, where the command-line start-up code that
    normally sets DIFFLIB (and imports the backend) never runs.
    """
    use_difflib = globals().get("DIFFLIB")
    if use_difflib is None:
        # Flag was never set: probe for Levenshtein once and remember the
        # outcome on the function object, so the pairwise comparison loop
        # does not retry a failing import on every call.
        use_difflib = getattr(similarity, "_auto_difflib", None)
        if use_difflib is None:
            try:
                import Levenshtein  # pylint: disable=unused-variable
                use_difflib = False
            except ImportError:
                use_difflib = True
            similarity._auto_difflib = use_difflib

    # Backends are imported here rather than read from module globals, so the
    # function does not depend on imports done by the start-up code.
    if use_difflib:
        import difflib
        return difflib.SequenceMatcher(a=a, b=b).ratio()

    import Levenshtein
    return Levenshtein.ratio(a, b)
14 |
19 |
15 class dupechecker(object): |
20 class dupechecker(object): |
16 """ |
21 """ |
17 Simple class to scan multiple directories recursive, |
22 Simple class to scan multiple directories recursive, |
18 build a list of movie filenames. |
23 build a list of movie filenames. |
65 print("\r%d %s\033[K" % ( |
71 print("\r%d %s\033[K" % ( |
66 idx, self.filelist[idx][0]), end='') |
72 idx, self.filelist[idx][0]), end='') |
67 sys.stdout.flush() |
73 sys.stdout.flush() |
68 for idx2 in range(idx + 1, listlen): |
74 for idx2 in range(idx + 1, listlen): |
69 if self.filelist[idx2]: |
75 if self.filelist[idx2]: |
70 if difflib.SequenceMatcher(a=self.filelist[idx][0], b=self.filelist[idx2][0]).ratio() > self.ratio: |
76 if similarity(self.filelist[idx][0], self.filelist[idx2][0]) > self.ratio: |
71 #print "possible duplicate %d %s" % (idx2, item2[0]) |
77 #print "possible duplicate %d %s" % (idx2, item2[0]) |
72 key = os.path.join(self.filelist[idx][2], self.filelist[idx][1]) |
78 key = os.path.join(self.filelist[idx][2], self.filelist[idx][1]) |
73 if not key in self.duplicates.keys(): |
79 if not key in self.duplicates: |
74 self.duplicates[key] = [] |
80 self.duplicates[key] = [] |
75 self.duplicates[key].append( |
81 self.duplicates[key].append( |
76 os.path.join( |
82 os.path.join( |
77 self.filelist[idx2][2], |
83 self.filelist[idx2][2], |
78 self.filelist[idx2][1] |
84 self.filelist[idx2][1] |
79 )) |
85 )) |
80 # unset the found duplicate, so that this will not be scanned again |
86 # unset the found duplicate, so that this will not be scanned again |
81 self.filelist[idx2] = None |
87 self.filelist[idx2] = None |
82 print("\n\n\n") |
88 print("\n\n") |
83 |
89 |
84 def output(self): |
90 def output(self): |
85 """ |
91 """ |
86 Dump found duplicates to console |
92 Dump found duplicates to console |
87 """ |
93 """ |
88 idx = 1 |
94 idx = 1 |
89 for base in self.duplicates.keys(): |
95 for base in self.duplicates: |
90 print("Duplicate file set #%i" % idx) |
96 print("Duplicate file set #%i" % idx) |
91 print(base) |
97 print(base) |
92 for dup in self.duplicates[base]: |
98 for dup in self.duplicates[base]: |
93 print(dup) |
99 print(dup) |
94 print() |
100 print() |
100 import argparse |
106 import argparse |
101 |
107 |
102 parser = argparse.ArgumentParser(\ |
108 parser = argparse.ArgumentParser(\ |
103 description='Movie database filename duplicate checker') |
109 description='Movie database filename duplicate checker') |
104 parser.add_argument('--ratio', type=float, default=0.85, \ |
110 parser.add_argument('--ratio', type=float, default=0.85, \ |
105 help='filename duplicate threshold 0.1 < ratio 1.0') |
111 help='filename duplicate threshold 0.1 < ratio 1.0 (default 0.85)') |
|
112 parser.add_argument('--difflib', action='store_true', default=False, \ |
|
113 help='force the use of difflib instead Levenshtein') |
106 parser.add_argument('basedir', metavar='basedir', nargs='+', \ |
114 parser.add_argument('basedir', metavar='basedir', nargs='+', \ |
107 help='one or more base directories') |
115 help='one or more base directories') |
108 |
116 |
109 args = parser.parse_args() |
117 args = parser.parse_args() |
110 dupe = dupechecker() |
118 dupe = dupechecker() |
111 dupe.ratio = args.ratio |
119 dupe.ratio = args.ratio |
|
120 if args.difflib: |
|
121 DIFFLIB = True |
|
122 import difflib |
|
123 else: |
|
124 try: |
|
125 import Levenshtein |
|
126 DIFFLIB = False |
|
127 except ImportError: |
|
128 import difflib |
|
129 DIFFLIB = True |
|
130 print("Consider 'pip install python-Levenshtein' for faster analyze") |
|
131 |
112 |
132 |
113 for srcstr in args.basedir: |
133 for srcstr in args.basedir: |
114 dupe.scandir(srcstr) |
134 dupe.scandir(srcstr) |
115 dupe.analyze() |
135 dupe.analyze() |
116 dupe.output() |
136 dupe.output() |