dupecheck.py

Sat, 04 Nov 2017 23:12:37 +0100

author
mdd
date
Sat, 04 Nov 2017 23:12:37 +0100
changeset 5
51e57e9f8db1
parent 4
a7e9e7974c22
child 15
82361ad7b3fe
permissions
-rwxr-xr-x

little speed tuning done

#!/usr/bin/env python

from __future__ import print_function
import difflib
import os, sys

# Directory tree that is scanned for recordings (relative to the current
# working directory).
BASEDIR="../DREAMBOX"

# Two titles whose difflib similarity ratio exceeds this value are reported
# as possible duplicates.
SIMILARITY_THRESHOLD = 0.85

# Scan results: FILELIST holds [title, filename, directory] entries (an entry
# is replaced by None once it has been assigned to a duplicate set);
# DUPLICATES maps the path of the first file of a set to the paths of the
# files that look like duplicates of it.
FILELIST=[]
DUPLICATES={}


def extract_title(filename):
    """Return the lower-cased comparison title for a file name.

    ``.ts`` names are split on " - "; when there are at least three
    segments the first two are dropped and the rest is the title
    (presumably the leading segments are recording date and channel --
    confirm against real file names).  With fewer segments the whole name
    is used, so a two-segment name no longer collapses to an empty title
    that would match every other two-segment name.  ``.mkv`` names are used
    as they are.  The extension is stripped in both cases.

    Returns None for any other file name.
    """
    if filename.endswith(".ts"):
        segments = filename.split(" - ")
        if len(segments) > 2:
            title = " - ".join(segments[2:])
        else:
            title = filename
        return title[:-3].lower()
    if filename.endswith(".mkv"):
        return filename[:-4].lower()
    return None


def collect_files(basedir):
    """Walk *basedir* and return [title, filename, directory] entries.

    Only files for which extract_title() yields a title are included.
    """
    entries = []
    for root, _subdirs, files in os.walk(basedir):
        for filename in files:
            title = extract_title(filename)
            if title is not None:
                entries.append([title, filename, root])
    return entries


def find_duplicates(filelist, threshold=SIMILARITY_THRESHOLD,
                    show_progress=True):
    """Group entries of *filelist* whose titles are nearly identical.

    Each remaining entry is compared with all later entries.  A later entry
    whose title similarity exceeds *threshold* is recorded as a duplicate
    and its slot in *filelist* is set to None, so it is neither compared
    again nor used as the base of another set.  Note that *filelist* is
    therefore modified in place.

    Returns a dict mapping the path of each base file to the list of paths
    of its possible duplicates.  When *show_progress* is true, the index and
    title currently being processed are written to stdout on a single,
    continuously overwritten line.
    """
    duplicates = {}
    # One matcher is reused for all comparisons; only the sequences change.
    matcher = difflib.SequenceMatcher()
    total = len(filelist)
    for idx, entry in enumerate(filelist):
        if not entry:
            continue
        if show_progress:
            # "\r" returns to the line start, "\033[K" clears the rest of it.
            print("\r%d %s\033[K" % (idx, entry[0]), end='')
            sys.stdout.flush()
        matcher.set_seq1(entry[0])
        for idx2 in range(idx + 1, total):
            candidate = filelist[idx2]
            if not candidate:
                continue
            matcher.set_seq2(candidate[0])
            # real_quick_ratio() and quick_ratio() are cheap upper bounds on
            # ratio(), so the expensive ratio() only runs for pairs that
            # could still exceed the threshold; the outcome is unchanged.
            if (matcher.real_quick_ratio() > threshold
                    and matcher.quick_ratio() > threshold
                    and matcher.ratio() > threshold):
                key = os.path.join(entry[2], entry[1])
                duplicates.setdefault(key, []).append(
                    os.path.join(candidate[2], candidate[1]))
                # unset the found duplicate, so that this will not be
                # scanned again
                filelist[idx2] = None
    return duplicates


def print_report(duplicates):
    """Print every duplicate set: a numbered header, the base file, then
    its possible duplicates, followed by an empty line."""
    for number, (base, dups) in enumerate(duplicates.items(), 1):
        print("Duplicate file set #%i" % number)
        print(base)
        for dup in dups:
            print(dup)
        print()


print("Reading files...")
FILELIST.extend(collect_files(BASEDIR))
print("%i files found, running duplicate testing loop" % len(FILELIST))

DUPLICATES.update(find_duplicates(FILELIST))

print("\n\n\n")
print_report(DUPLICATES)

mercurial