Sat, 04 Nov 2017 22:34:12 +0100
added filename duplicate checker
.hgignore | file | annotate | diff | comparison | revisions | |
dupecheck.py | file | annotate | diff | comparison | revisions |
--- a/.hgignore Sat Nov 04 22:30:19 2017 +0100 +++ b/.hgignore Sat Nov 04 22:34:12 2017 +0100 @@ -1,4 +1,5 @@ syntax: glob *.pyc -eit.old/* \ No newline at end of file +eit.old/* +testfiles/*
#!/usr/bin/env python
# dupecheck.py -- added as a new file in this changeset
# (/dev/null -> b/dupecheck.py, Sat Nov 04 22:34:12 2017 +0100).
"""Report recordings below BASEDIR whose titles look like duplicates.

The script walks BASEDIR, derives a lower-cased title from every ``.ts``
and ``.mkv`` filename, compares every pair of titles with
``difflib.SequenceMatcher`` and prints each file followed by the files
that look like copies of it.
"""

import difflib
import itertools
import os
import sys

BASEDIR = "../DREAMBOX"

# Two titles whose similarity ratio is above this value are reported.
SIMILARITY_THRESHOLD = 0.85

FILELIST = []
DUPLICATES = {}


def similarity(seq1, seq2):
    """Return the difflib similarity ratio (0.0 .. 1.0) of two sequences.

    The comparison is case-sensitive; callers in this script pass titles
    that have already been lower-cased.
    """
    return difflib.SequenceMatcher(a=seq1, b=seq2).ratio()


def title_for(filename):
    """Return the lower-cased comparison title for *filename*.

    ``.ts`` names are split on " - "; when there are at least three fields
    the first two are dropped (presumably a date and a channel prefix --
    confirm against the real recordings) and the rest is the title.  Names
    with fewer fields are used whole.  Previously a name with exactly two
    fields produced an empty title, and since two empty titles compare as
    identical, all such files were reported as duplicates of each other.

    ``.mkv`` names are used whole.  The extension is stripped in both
    cases.  Returns ``None`` for any other extension.
    """
    if filename.endswith(".ts"):
        parts = filename.split(" - ")
        if len(parts) < 3:
            title = filename
        else:
            title = " - ".join(parts[2:])
        return title[:-3].lower()
    if filename.endswith(".mkv"):
        return filename[:-4].lower()
    return None


def collect_files(basedir):
    """Walk *basedir* and return ``[title, filename, root]`` per recording.

    Files for which ``title_for`` returns ``None`` are skipped.
    """
    found = []
    for root, _subdirs, files in os.walk(basedir):
        for filename in files:
            title = title_for(filename)
            if title is not None:
                found.append([title, filename, root])
    return found


def find_duplicates(filelist, threshold=SIMILARITY_THRESHOLD):
    """Map each file path to the later paths whose titles resemble it.

    *filelist* holds ``[title, filename, root]`` entries.  Every unordered
    pair is compared exactly once, in list order, so the key of a pair is
    always the entry that appears first in *filelist*.
    """
    duplicates = {}
    # combinations() yields (i, j) pairs with i < j in the same order as a
    # nested loop over filelist[i + 1:], without copying a slice per item.
    for first, second in itertools.combinations(filelist, 2):
        if similarity(first[0], second[0]) > threshold:
            key = os.path.join(first[2], first[1])
            duplicates.setdefault(key, []).append(
                os.path.join(second[2], second[1]))
    return duplicates


# The scan and the report run at module level, as in the original script.
FILELIST.extend(collect_files(BASEDIR))
DUPLICATES.update(find_duplicates(FILELIST))

for base in DUPLICATES:
    # Single-argument print() calls behave the same on Python 2 and 3.
    print(base)
    for dup in DUPLICATES[base]:
        print(dup)
    print("")