dupecheck.py

Sat, 04 Nov 2017 22:52:01 +0100

author
mdd
date
Sat, 04 Nov 2017 22:52:01 +0100
changeset 4
a7e9e7974c22
parent 3
569fa9a431b9
child 5
51e57e9f8db1
permissions
-rwxr-xr-x

prepare for speedup

#!/usr/bin/env python

from __future__ import print_function
import difflib
import os, sys

# Directory tree that is walked recursively for .ts/.mkv files; being a
# relative path, it is resolved against the current working directory.
BASEDIR="../DREAMBOX"

# One [title, filename, directory] entry per matching file found under
# BASEDIR; the title is the lower-cased string used for comparison.
FILELIST=[]
# Maps the full path of a file to the list of full paths of later files
# whose titles are similar to it.
DUPLICATES={}

def similarity(seq1, seq2):
    """Return how similar two sequences are as a float in [0.0, 1.0].

    The value is ``difflib.SequenceMatcher.ratio()`` for the pair: 1.0 means
    the sequences are identical, 0.0 means they have nothing in common.
    No case folding happens here, so the comparison is case-sensitive.
    """
    matcher = difflib.SequenceMatcher(None, seq1, seq2)
    return matcher.ratio()

def _comparison_title(filename):
    """Return the lower-cased title used to compare *filename*, or None.

    Only names ending in ".ts" or ".mkv" are considered; any other name
    yields None so the caller can skip it.

    For ".mkv" files the title is the name without its extension.

    For ".ts" files the name is split on " - " and the first two fields are
    dropped (presumably the timestamp and channel of a recording name --
    TODO confirm). If nothing is left after dropping them (fewer than three
    fields, or an empty remainder) the whole name without extension is used
    instead. An empty title must be avoided: two empty titles compare as
    identical, which would report all such files as duplicates of each other.
    """
    if filename.endswith(".ts"):
        stem = filename[:-len(".ts")]
        remainder = " - ".join(stem.split(" - ")[2:])
        if remainder:
            stem = remainder
    elif filename.endswith(".mkv"):
        stem = filename[:-len(".mkv")]
    else:
        return None
    return stem.lower()


# Collect one [title, filename, directory] entry per .ts/.mkv file below
# BASEDIR.
print("Reading files...")
for root, _subdirs, files in os.walk(BASEDIR):
    for filename in files:
        title = _comparison_title(filename)
        if title is not None:
            FILELIST.append([title, filename, root])
print("%i files found, running duplicate testing loop" % len(FILELIST))


# Compare every title with every later one, so each pair is tested exactly
# once. Matches are recorded in DUPLICATES under the full path of the
# earlier file.
for idx, (title, filename, root) in enumerate(FILELIST):
    # Progress display on a single line: "\033[K" erases the rest of the
    # line and "\r" moves the cursor back to its start for the next update.
    print("%d %s\033[K\r" % (idx, title),
        end='')
    sys.stdout.flush()
    # The key does not depend on the inner loop, so build it once per file.
    key = os.path.join(root, filename)
    for other_title, other_filename, other_root in FILELIST[idx+1:]:
        # Titles with a similarity ratio above 0.85 count as duplicates.
        if similarity(title, other_title) > 0.85:
            # setdefault creates the list on first use with a single hashed
            # lookup instead of scanning a list of all keys.
            DUPLICATES.setdefault(key, []).append(
                os.path.join(other_root, other_filename))

# Move below the progress line, then print one numbered section per entry in
# DUPLICATES: a header, the base file, each similar file, and a blank line.
print("\n\n\n")
for set_number, (base, dups) in enumerate(DUPLICATES.items(), 1):
    section = ["Duplicate file set #%i" % set_number, base]
    section.extend(dups)
    # The trailing empty string yields the blank separator line.
    section.append("")
    print("\n".join(section))

mercurial