dupecheck.py

Sat, 04 Nov 2017 22:34:12 +0100

author
mdd
date
Sat, 04 Nov 2017 22:34:12 +0100
changeset 3
569fa9a431b9
child 4
a7e9e7974c22
permissions
-rwxr-xr-x

added filename duplicate checker

#!/usr/bin/env python

import difflib
import os, sys

# Root directory that is walked recursively when looking for recordings.
BASEDIR = "../DREAMBOX"

# One [normalized title, filename, directory] entry per recording found.
FILELIST = []
# Maps the full path of a file to the paths of the later files whose titles
# were judged similar to it.
DUPLICATES = {}

def similarity(seq1, seq2):
    """Return the difflib similarity ratio of two sequences.

    The result is a float in [0.0, 1.0]; 1.0 means the sequences are
    identical. The comparison is case-sensitive, so callers that want a
    case-insensitive match have to normalize the inputs themselves.
    """
    matcher = difflib.SequenceMatcher(None, seq1, seq2)
    return matcher.ratio()

# Walk BASEDIR and record every .ts / .mkv file as
# [normalized title, filename, directory] in FILELIST.
for root, subdirs, files in os.walk(BASEDIR):
    for filename in files:
        if filename.endswith(".ts"):
            stem = filename[:-3]
            fields = stem.split(" - ")
            if len(fields) > 2:
                # The first two " - "-separated fields are dropped
                # (presumably recording timestamp and channel -- confirm
                # against the actual file naming); the rest is the title.
                title = " - ".join(fields[2:])
            else:
                # With fewer than three fields there is nothing that can be
                # safely stripped. Keep the whole stem: an empty title would
                # compare as 100% similar to every other empty title and
                # flag unrelated files as duplicates of each other.
                title = stem
        elif filename.endswith(".mkv"):
            title = filename[:-4]
        else:
            # Not a file type we compare.
            continue
        # Titles are lowercased so the later comparison is case-insensitive.
        FILELIST.append([title.lower(), filename, root])


# Compare every title with every later title exactly once and group the
# paths of sufficiently similar files under the path of the earlier file.
for idx, item in enumerate(FILELIST):
    # Built once per reference file instead of once per match.
    key = os.path.join(item[2], item[1])
    for item2 in FILELIST[idx + 1:]:
        # 0.85 is the ratio above which two titles count as possible
        # duplicates.
        if similarity(item[0], item2[0]) > 0.85:
            # setdefault avoids materializing and scanning the key list
            # that "key in DUPLICATES.keys()" produces on Python 2.
            DUPLICATES.setdefault(key, []).append(
                os.path.join(item2[2], item2[1]))

# Print each group: the reference file first, then every file judged similar
# to it, followed by an empty line as separator.
# Single-argument print(...) calls produce the same output under Python 2
# and Python 3, whereas the bare print statement is a SyntaxError on Python 3.
for base, dups in DUPLICATES.items():
    print(base)
    for dup in dups:
        print(dup)
    print("")

mercurial