dupecheck.py

changeset 22
c18abd9198c0
parent 21
1c0beeca2f9c
child 32
df89a8fba2a2
equal deleted inserted replaced
21:1c0beeca2f9c 22:c18abd9198c0
7 7
8 #pylint: disable=line-too-long 8 #pylint: disable=line-too-long
9 #pylint: disable=invalid-name 9 #pylint: disable=invalid-name
10 10
11 from __future__ import print_function 11 from __future__ import print_function
12 import difflib
13 import os, sys 12 import os, sys
13
14 def similarity(a, b):
15 if DIFFLIB:
16 return difflib.SequenceMatcher(a=a, b=b).ratio()
17 else:
18 return Levenshtein.ratio(a, b)
14 19
15 class dupechecker(object): 20 class dupechecker(object):
16 """ 21 """
17 Simple class to scan multiple directories recursive, 22 Simple class to scan multiple directories recursive,
18 build a list of movie filenames. 23 build a list of movie filenames.
21 def __init__(self): 26 def __init__(self):
22 self.basedir = "" 27 self.basedir = ""
23 self.filelist = [] 28 self.filelist = []
24 self.duplicates = {} 29 self.duplicates = {}
25 self.ratio = 0.85 30 self.ratio = 0.85
31
26 32
27 def reset(self): 33 def reset(self):
28 self.filelist = [] 34 self.filelist = []
29 self.duplicates = {} 35 self.duplicates = {}
30 36
65 print("\r%d %s\033[K" % ( 71 print("\r%d %s\033[K" % (
66 idx, self.filelist[idx][0]), end='') 72 idx, self.filelist[idx][0]), end='')
67 sys.stdout.flush() 73 sys.stdout.flush()
68 for idx2 in range(idx + 1, listlen): 74 for idx2 in range(idx + 1, listlen):
69 if self.filelist[idx2]: 75 if self.filelist[idx2]:
70 if difflib.SequenceMatcher(a=self.filelist[idx][0], b=self.filelist[idx2][0]).ratio() > self.ratio: 76 if similarity(self.filelist[idx][0], self.filelist[idx2][0]) > self.ratio:
71 #print "possible duplicate %d %s" % (idx2, item2[0]) 77 #print "possible duplicate %d %s" % (idx2, item2[0])
72 key = os.path.join(self.filelist[idx][2], self.filelist[idx][1]) 78 key = os.path.join(self.filelist[idx][2], self.filelist[idx][1])
73 if not key in self.duplicates.keys(): 79 if not key in self.duplicates:
74 self.duplicates[key] = [] 80 self.duplicates[key] = []
75 self.duplicates[key].append( 81 self.duplicates[key].append(
76 os.path.join( 82 os.path.join(
77 self.filelist[idx2][2], 83 self.filelist[idx2][2],
78 self.filelist[idx2][1] 84 self.filelist[idx2][1]
79 )) 85 ))
80 # unset the found duplicate, so that this will not be scanned again 86 # unset the found duplicate, so that this will not be scanned again
81 self.filelist[idx2] = None 87 self.filelist[idx2] = None
82 print("\n\n\n") 88 print("\n\n")
83 89
84 def output(self): 90 def output(self):
85 """ 91 """
86 Dump found duplicates to console 92 Dump found duplicates to console
87 """ 93 """
88 idx = 1 94 idx = 1
89 for base in self.duplicates.keys(): 95 for base in self.duplicates:
90 print("Duplicate file set #%i" % idx) 96 print("Duplicate file set #%i" % idx)
91 print(base) 97 print(base)
92 for dup in self.duplicates[base]: 98 for dup in self.duplicates[base]:
93 print(dup) 99 print(dup)
94 print() 100 print()
100 import argparse 106 import argparse
101 107
102 parser = argparse.ArgumentParser(\ 108 parser = argparse.ArgumentParser(\
103 description='Movie database filename duplicate checker') 109 description='Movie database filename duplicate checker')
104 parser.add_argument('--ratio', type=float, default=0.85, \ 110 parser.add_argument('--ratio', type=float, default=0.85, \
105 help='filename duplicate threshold 0.1 < ratio 1.0') 111 help='filename duplicate threshold 0.1 < ratio 1.0 (default 0.85)')
112 parser.add_argument('--difflib', action='store_true', default=False, \
113 help='force the use of difflib instead Levenshtein')
106 parser.add_argument('basedir', metavar='basedir', nargs='+', \ 114 parser.add_argument('basedir', metavar='basedir', nargs='+', \
107 help='one or more base directories') 115 help='one or more base directories')
108 116
109 args = parser.parse_args() 117 args = parser.parse_args()
110 dupe = dupechecker() 118 dupe = dupechecker()
111 dupe.ratio = args.ratio 119 dupe.ratio = args.ratio
120 if args.difflib:
121 DIFFLIB = True
122 import difflib
123 else:
124 try:
125 import Levenshtein
126 DIFFLIB = False
127 except ImportError:
128 import difflib
129 DIFFLIB = True
130 print("Consider 'pip install python-Levenshtein' for faster analyze")
131
112 132
113 for srcstr in args.basedir: 133 for srcstr in args.basedir:
114 dupe.scandir(srcstr) 134 dupe.scandir(srcstr)
115 dupe.analyze() 135 dupe.analyze()
116 dupe.output() 136 dupe.output()

mercurial