dupecheck.py

changeset 32
df89a8fba2a2
parent 22
c18abd9198c0
child 33
83bcb5931ee3
equal deleted inserted replaced
31:52371bbcde5c 32:df89a8fba2a2
def similarity(a, b):
    """
    Return the similarity ratio of the two sequences a and b.

    Uses difflib.SequenceMatcher when the module-level DIFFLIB flag is
    truthy, otherwise Levenshtein.ratio.
    """
    if not DIFFLIB:
        return Levenshtein.ratio(a, b)
    matcher = difflib.SequenceMatcher(a=a, b=b)
    return matcher.ratio()
19
# Unit suffixes in ascending order; each step is a factor of 1024.
suffixes = ['b', 'K', 'M', 'G', 'T', 'P']


def humansize(nbytes):
    """
    Format a byte count as a short human readable string.

    The value is divided by 1024 until it drops below 1024 or the
    largest suffix is reached, then rendered with at most two decimals
    (trailing zeros and a trailing dot are stripped) followed by a
    space and the unit suffix, e.g. 1536 -> '1.5 K'.
    """
    value = nbytes
    i = 0
    last = len(suffixes) - 1
    while value >= 1024 and i < last:
        value /= 1024.
        i += 1
    # Rounding to two decimals can lift a value just below the unit
    # boundary (e.g. 1023.999) to 1024.00; move on to the next unit so
    # the result reads '1 M' instead of '1024 K'.
    if i < last and float('%.2f' % value) >= 1024:
        value /= 1024.
        i += 1
    f = ('%.2f' % value).rstrip('0').rstrip('.')
    return '%s %s' % (f, suffixes[i])
19 28
20 class dupechecker(object): 29 class dupechecker(object):
21 """ 30 """
22 Simple class to scan multiple directories recursive, 31 Simple class to scan multiple directories recursive,
23 build a list of movie filenames. 32 build a list of movie filenames.
50 if len(title) == 1: 59 if len(title) == 1:
51 title = title[0] 60 title = title[0]
52 else: 61 else:
53 title = " - ".join(title[2:]) 62 title = " - ".join(title[2:])
54 title = title[:-3].lower() 63 title = title[:-3].lower()
55 self.filelist.append([title, filename, root]) 64 self.filelist.append([title, filename, root, ext])
56 elif ext in ['.mkv', '.avi', '.mpg', '.mpeg', '.mp4']: 65 elif ext in ['.mkv', '.avi', '.mpg', '.mpeg', '.mp4']:
57 title = filename[:-4].lower() 66 title = filename[:-4].lower()
58 self.filelist.append([title, filename, root]) 67 self.filelist.append([title, filename, root, ext])
68
def statistics(self):
    """
    Summarize disk usage and print stats about found filetypes.

    Walks self.filelist, whose entries are laid out as
    [title, filename, root, ext], and collects per extension the number
    of files and the sum of their st_size values.  A header line and one
    line per extension are printed to stdout, with the size column
    rendered by humansize().  Nothing is returned.
    """
    import sys  # local import: only needed for the warning below

    stats = {}
    for item in self.filelist:
        filename, root, ext = item[1], item[2], item[3]
        # entry layout: [file count, total bytes]
        entry = stats.setdefault(ext, [0, 0.0])
        entry[0] += 1
        path = os.path.join(root, filename)
        try:
            entry[1] += os.stat(path).st_size
        except OSError as err:
            # The file may have vanished since the scan or be a dangling
            # symlink; keep it in the count, skip its size and carry on
            # instead of aborting the whole summary.
            sys.stderr.write("cannot stat %s: %s\n" % (path, err))
    print ("%5s %6s %10s" % (
        "File:",
        "Count:",
        "Size:"))
    # Sorted so the report order is stable between runs.
    for ext in sorted(stats):
        print ("%5s %6i %10s" % (
            ext, stats[ext][0],
            humansize(stats[ext][1])))
89
59 90
60 def analyze(self): 91 def analyze(self):
61 """ 92 """
62 Analyze the scanlist for duplicates 93 Analyze the scanlist for duplicates
63 """ 94 """
109 description='Movie database filename duplicate checker') 140 description='Movie database filename duplicate checker')
110 parser.add_argument('--ratio', type=float, default=0.85, \ 141 parser.add_argument('--ratio', type=float, default=0.85, \
111 help='filename duplicate threshold 0.1 < ratio 1.0 (default 0.85)') 142 help='filename duplicate threshold 0.1 < ratio 1.0 (default 0.85)')
112 parser.add_argument('--difflib', action='store_true', default=False, \ 143 parser.add_argument('--difflib', action='store_true', default=False, \
113 help='force the use of difflib instead Levenshtein') 144 help='force the use of difflib instead Levenshtein')
145 parser.add_argument('--stats', action='store_true', default=False, \
146 help='generate stats summary instead of check for duplicates')
114 parser.add_argument('basedir', metavar='basedir', nargs='+', \ 147 parser.add_argument('basedir', metavar='basedir', nargs='+', \
115 help='one or more base directories') 148 help='one or more base directories')
116 149
117 args = parser.parse_args() 150 args = parser.parse_args()
118 dupe = dupechecker() 151 dupe = dupechecker()
130 print("Consider 'pip install python-Levenshtein' for faster analyze") 163 print("Consider 'pip install python-Levenshtein' for faster analyze")
131 164
132 165
133 for srcstr in args.basedir: 166 for srcstr in args.basedir:
134 dupe.scandir(srcstr) 167 dupe.scandir(srcstr)
135 dupe.analyze() 168 if args.stats:
136 dupe.output() 169 dupe.statistics()
170 else:
171 dupe.analyze()
172 dupe.output()

mercurial