#!/usr/bin/env python
"""
Toolkit / executable to scan for duplicate filenames in movie database

2017 by mdd
"""

# Module state after HG changeset 1c0beeca2f9c ("cleanup dupechecker"),
# recovered from the whitespace-collapsed patch.

#pylint: disable=line-too-long
#pylint: disable=invalid-name

from __future__ import print_function

import argparse
import difflib
import os
import sys

# Container formats (besides ".ts") whose files are collected by scandir().
MOVIE_EXTENSIONS = ('.mkv', '.avi', '.mpg', '.mpeg', '.mp4')


def _title_from_filename(filename):
    """
    Derive the lower-cased comparison title from a movie filename.

    Returns None when the extension is not a known movie format.

    For ".ts" files whose name has more than two " - " separated fields,
    the first two fields are dropped and the rest is used as the title
    (presumably a recording timestamp and channel prefix -- confirm
    against the recorder's naming scheme). Names with fewer fields are
    used as a whole.
    """
    stem, ext = os.path.splitext(filename)
    ext = ext.lower()
    if ext == ".ts":
        parts = stem.split(" - ")
        if len(parts) > 2:
            # Fall back to the full stem if stripping the prefix fields
            # leaves nothing; empty titles would all match each other.
            stem = " - ".join(parts[2:]) or stem
        return stem.lower()
    if ext in MOVIE_EXTENSIONS:
        # splitext() strips the whole extension regardless of its length,
        # so ".mpeg" no longer leaves a trailing dot in the title.
        return stem.lower()
    return None


def _is_similar(first, second, threshold):
    """
    Tell whether the difflib similarity ratio of two titles exceeds
    the given threshold.
    """
    matcher = difflib.SequenceMatcher(a=first, b=second)
    # real_quick_ratio() and quick_ratio() are cheap upper bounds of
    # ratio(); a pair that fails them can never pass the expensive check.
    return (matcher.real_quick_ratio() > threshold
            and matcher.quick_ratio() > threshold
            and matcher.ratio() > threshold)


class dupechecker(object):
    """
    Simple class to scan multiple directories recursively,
    build a list of movie filenames,
    analyze the list for duplicates and dump them
    """
    def __init__(self, ratio=0.85):
        """
        Create an empty checker.

        ratio is the similarity threshold a pair of titles has to
        exceed to be reported as duplicates.
        """
        # last directory passed to scandir()
        self.basedir = ""
        # entries of the form [title, filename, directory]
        self.filelist = []
        # maps the path of the first file of a set to the paths of its duplicates
        self.duplicates = {}
        self.ratio = ratio

    def reset(self):
        """
        Forget all scanned files and all found duplicates
        """
        self.filelist = []
        self.duplicates = {}

    def scandir(self, basedir):
        """
        Scan a base directory for movie files and add them to
        the list for analyze

        May be called several times to collect files from more than
        one directory tree before running analyze().
        """
        self.basedir = basedir
        print("Scanning directory: %s" % basedir)
        for root, _subdirs, files in os.walk(basedir):
            for filename in files:
                title = _title_from_filename(filename)
                if title is not None:
                    self.filelist.append([title, filename, root])

    def analyze(self):
        """
        Analyze the scanlist for duplicates

        Every file is compared to all files after it in the list. Matches
        are stored in self.duplicates under the path of the earlier file.
        Entries identified as a duplicate are replaced by None in
        self.filelist so they are not compared again.
        """
        entries = self.filelist
        print("%i files to analyze, running duplicate testing loop..." % (
            len(entries)))

        for idx, entry in enumerate(entries):
            if not entry:
                # already reported as duplicate of an earlier file
                continue
            title, filename, root = entry
            # progress line: return to line start and clear the rest of the line
            print("\r%d %s\033[K" % (idx, title), end='')
            sys.stdout.flush()
            for idx2 in range(idx + 1, len(entries)):
                candidate = entries[idx2]
                if not candidate:
                    continue
                if _is_similar(title, candidate[0], self.ratio):
                    key = os.path.join(root, filename)
                    self.duplicates.setdefault(key, []).append(
                        os.path.join(candidate[2], candidate[1]))
                    # unset the found duplicate, so that this will not be scanned again
                    entries[idx2] = None
        print("\n\n\n")

    def output(self):
        """
        Dump found duplicates to console
        """
        for idx, (base, dupes) in enumerate(self.duplicates.items(), 1):
            print("Duplicate file set #%i" % idx)
            print(base)
            for dup in dupes:
                print(dup)
            print()


def main():
    """
    Parse the command line, scan all given base directories and
    print the duplicate sets found
    """
    parser = argparse.ArgumentParser(
        description='Movie database filename duplicate checker')
    parser.add_argument('--ratio', type=float, default=0.85,
                        help='filename duplicate threshold 0.1 < ratio 1.0')
    parser.add_argument('basedir', metavar='basedir', nargs='+',
                        help='one or more base directories')

    args = parser.parse_args()
    dupe = dupechecker()
    dupe.ratio = args.ratio

    for srcstr in args.basedir:
        dupe.scandir(srcstr)
    dupe.analyze()
    dupe.output()


if __name__ == "__main__":
    main()