dreambox_tools: comparison dupecheck.py

-:df89a8fba2a2
+:83bcb5931ee3
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
 """
 Toolkit / executable to scan for duplicate filenames in movie database
 2017 by mdd
 """
 while nbytes >= 1024 and i < len(suffixes)-1:
 nbytes /= 1024.
 i += 1
 f = ('%.2f' % nbytes).rstrip('0').rstrip('.')
 return '%s %s' % (f, suffixes[i])
def replace_all(text, dic):
    """
    Replace every occurrence of each key of ``dic`` in ``text`` with the
    value mapped to it and return the resulting string.

    The pairs are applied one after another in the mapping's iteration
    order, so a later replacement also sees the output of earlier ones.
    An empty mapping returns ``text`` unchanged.
    """
    # dict.items() is available on Python 2 and 3 alike; iteritems() only
    # exists on Python 2 and raises AttributeError on Python 3.
    for old, new in dic.items():
        text = text.replace(old, new)
    return text
 class dupechecker(object):
 """
 Simple class to scan multiple directories recursively,
 build a list of movie filenames.
 # Reset the scan state: start over with an empty file list and an
 # empty duplicates mapping.
 def reset(self):
 self.filelist = []
 self.duplicates = {}
-def scandir(self, basedir):
+def scandir(self, basedir, extra=[]):
 """
 Scan a base directory for movie files and add them to
 the list for analysis
 """
 self.basedir = basedir
 title = title[:-3].lower()
 self.filelist.append([title, filename, root, ext])
 elif ext in ['.mkv', '.avi', '.mpg', '.mpeg', '.mp4']:
 title = filename[:-4].lower()
 self.filelist.append([title, filename, root, ext])
+elif ext in extra:
+title = filename[:-4].lower()
+self.filelist.append([title, filename, root, ext])
def fixnames(self):
    """
    Search for defective filenames and replace illegal characters.

    Each entry of ``self.filelist`` is a list of the form
    ``[title, filename, root, ext]``; only entries whose extension is
    ``.mkv`` or ``.txt`` are processed.  'ß' is transliterated to 'ss',
    then every character outside the whitelist (ASCII letters, digits,
    ``. _ - ( ) & '``, space and the German umlauts) is replaced by '-'.
    Every file whose name changes is printed and renamed inside its own
    directory.

    A file is skipped, with a message, when the cleaned name already
    exists: os.rename() silently overwrites an existing target on POSIX,
    and two different broken names can clean up to the same name.
    """
    import re
    # The pattern does not depend on the item, so compile it only once.
    illegal = re.compile(r'[^A-Za-z0-9\.\_\-\(\)\&öäüÖÄÜ\' ]')
    for item in self.filelist:
        if item[3] not in ('.mkv', '.txt'):
            continue
        # Umlauts are whitelisted by the pattern above and therefore kept;
        # enable the mappings below to transliterate them instead.
        cleanfn = replace_all(item[1], {
            #'ä':'ae', 'Ä':'Ae',
            #'ö':'oe', 'Ö':'Oe',
            #'ü':'ue', 'Ü':'Ue',
            'ß':'ss',
        })
        cleanfn = illegal.sub('-', cleanfn)
        if item[1] == cleanfn:
            # name is already clean
            continue
        source = os.path.join(item[2], item[1])
        target = os.path.join(item[2], cleanfn)
        if os.path.exists(target):
            # never clobber another file that already has the clean name
            print ("%s (skipped, %s already exists)" % (item[1], cleanfn))
            continue
        print (item[1])
        os.rename(source, target)
 def statistics(self):
 """
 Summarize disk usage and print stats about found filetypes
 """
 item[2], item[1])).st_size
 print ("%5s %6s %10s" % (
 "File:",
 "Count:",
 "Size:"))
+sum_count = 0
+sum_size = 0.0
 for ext in stats.keys():
+sum_count += stats[ext][0]
+sum_size += stats[ext][1]
 print ("%5s %6i %10s" % (
 ext, stats[ext][0],
 humansize(stats[ext][1])))
+print ("%5s %6i %10s" % (
+"TOTAL", sum_count,
+humansize(sum_size)))
 def analyze(self):
 """
 Analyze the scanlist for duplicates
 help='filename duplicate threshold 0.1 < ratio 1.0 (default 0.85)')
 parser.add_argument('--difflib', action='store_true', default=False, \
 help='force the use of difflib instead Levenshtein')
 parser.add_argument('--stats', action='store_true', default=False, \
 help='generate stats summary instead of check for duplicates')
+parser.add_argument('--fixnames', action='store_true', default=False, \
+help='scan for mkv and txt, fix broken filenames for windows')
 parser.add_argument('basedir', metavar='basedir', nargs='+', \
 help='one or more base directories')
 args = parser.parse_args()
 dupe = dupechecker()
 import difflib
 DIFFLIB = True
 print("Consider 'pip install python-Levenshtein' for faster analyze")
+if args.fixnames:
+for srcstr in args.basedir:
+dupe.scandir(srcstr, ['.txt'])
+if len(dupe.filelist) > 0:
+print ("Checking %i file names..." % len(dupe.filelist))
+dupe.fixnames()
+dupe.filelist = []
+sys.exit(0)
 for srcstr in args.basedir:
 dupe.scandir(srcstr)
-if args.stats:
+if args.stats or args.fixnames:
 dupe.statistics()
 else:
 dupe.analyze()
 dupe.output()

Mercurial > hg-public > dreambox_tools / file comparison

comparison: dupecheck.py

dupecheck.py