Tue, 12 Dec 2017 01:26:51 +0100
Added tag V1.1 for changeset a2951f7c435e
#!/usr/bin/env python
"""
Toolkit / executable to scan for duplicate filenames in movie database

2017 by mdd
"""

#pylint: disable=line-too-long
#pylint: disable=invalid-name

from __future__ import print_function

import difflib
import os
import sys

# Pick the similarity backend at import time so that the module also works
# when it is imported as a toolkit and not only when run as a script.
# Levenshtein is an optional third-party package; difflib is the fallback.
try:
    import Levenshtein
    DIFFLIB = False
except ImportError:
    Levenshtein = None
    DIFFLIB = True

# File extensions (lower case) whose title is simply the name without extension.
MOVIE_EXTENSIONS = ('.mkv', '.avi', '.mpg', '.mpeg', '.mp4')


def similarity(a, b):
    """
    Return the similarity ratio of the strings a and b.

    Uses difflib when the module level flag DIFFLIB is set or when the
    Levenshtein module could not be imported, Levenshtein.ratio otherwise.
    """
    if DIFFLIB or Levenshtein is None:
        return difflib.SequenceMatcher(a=a, b=b).ratio()
    return Levenshtein.ratio(a, b)


class dupechecker(object):
    """
    Simple class to scan multiple directories recursive,
    build a list of movie filenames.
    analyze the list for duplicates and dump them
    """
    def __init__(self):
        self.basedir = ""
        # entries are [title, filename, directory]; analyze() replaces
        # entries it has reported as duplicates with None
        self.filelist = []
        # maps the path of a file to the list of paths considered its duplicates
        self.duplicates = {}
        # two titles count as duplicates when their similarity is above this
        self.ratio = 0.85

    def reset(self):
        """
        Forget all scanned files and all found duplicates
        """
        self.filelist = []
        self.duplicates = {}

    def scandir(self, basedir):
        """
        Scan a base directory for movie files and add them to the list for analyze

        The title used for comparison is the lower cased filename without
        its extension. For ".ts" files named "<a> - <b> - <title>.ts" the
        two leading fields are dropped (presumably recording date and
        channel); ".ts" names with fewer fields are used as a whole.
        """
        self.basedir = basedir
        print("Scanning directory: %s" % basedir)
        for root, _subdirs, files in os.walk(basedir):
            for filename in files:
                stem, ext = os.path.splitext(filename)
                ext = ext.lower()
                if ext == ".ts":
                    parts = stem.split(" - ")
                    # Only strip the two leading fields when a title remains;
                    # otherwise every short name would collapse to an empty
                    # title and match every other empty title.
                    if len(parts) > 2:
                        title = " - ".join(parts[2:])
                    else:
                        title = stem
                elif ext in MOVIE_EXTENSIONS:
                    # splitext handles extensions of any length (".mpeg")
                    title = stem
                else:
                    continue
                self.filelist.append([title.lower(), filename, root])

    def analyze(self):
        """
        Analyze the scanlist for duplicates

        Every file is compared with all files after it in the list. A match
        is stored in self.duplicates under the path of the earlier file and
        its list entry is set to None, so it is not compared again.
        """
        entries = self.filelist
        total = len(entries)
        print("%i files to analyze, running duplicate testing loop..." % total)

        for idx in range(total):
            current = entries[idx]
            if not current:
                continue
            title, filename, root = current
            # progress display: rewrite the current console line in place
            print("\r%d %s\033[K" % (idx, title), end='')
            sys.stdout.flush()
            key = os.path.join(root, filename)
            for idx2 in range(idx + 1, total):
                candidate = entries[idx2]
                if not candidate:
                    continue
                if similarity(title, candidate[0]) > self.ratio:
                    self.duplicates.setdefault(key, []).append(
                        os.path.join(candidate[2], candidate[1]))
                    # unset the found duplicate, so that this will not be scanned again
                    entries[idx2] = None
        print("\n\n")

    def output(self):
        """
        Dump found duplicates to console
        """
        for idx, base in enumerate(self.duplicates, 1):
            print("Duplicate file set #%i" % idx)
            print(base)
            for dup in self.duplicates[base]:
                print(dup)
            print()


def main(argv=None):
    """
    Command line entry point: scan the given directories and dump duplicates

    argv is the argument list without the program name; None lets argparse
    read sys.argv.
    """
    global DIFFLIB #pylint: disable=global-statement
    import argparse

    parser = argparse.ArgumentParser(
        description='Movie database filename duplicate checker')
    parser.add_argument('--ratio', type=float, default=0.85,
                        help='filename duplicate threshold 0.1 < ratio 1.0 (default 0.85)')
    parser.add_argument('--difflib', action='store_true', default=False,
                        help='force the use of difflib instead Levenshtein')
    parser.add_argument('basedir', metavar='basedir', nargs='+',
                        help='one or more base directories')
    args = parser.parse_args(argv)

    if args.difflib:
        DIFFLIB = True
    elif Levenshtein is None:
        # fell back to difflib only because Levenshtein is not installed
        DIFFLIB = True
        print("Consider 'pip install python-Levenshtein' for faster analyze")

    dupe = dupechecker()
    dupe.ratio = args.ratio
    for srcstr in args.basedir:
        dupe.scandir(srcstr)
    dupe.analyze()
    dupe.output()


if __name__ == "__main__":
    main()