dupecheck.py

changeset 5
51e57e9f8db1
parent 4
a7e9e7974c22
child 15
82361ad7b3fe
equal deleted inserted replaced
4:a7e9e7974c22 5:51e57e9f8db1
6 6
7 BASEDIR="../DREAMBOX" 7 BASEDIR="../DREAMBOX"
8 8
9 FILELIST=[] 9 FILELIST=[]
10 DUPLICATES={} 10 DUPLICATES={}
11
12 def similarity(seq1, seq2):
13 #return difflib.SequenceMatcher(a=seq1.lower(), b=seq2.lower()).ratio()
14 return difflib.SequenceMatcher(a=seq1, b=seq2).ratio()
15 11
16 print("Reading files...") 12 print("Reading files...")
17 for root, subdirs, files in os.walk(BASEDIR): 13 for root, subdirs, files in os.walk(BASEDIR):
18 for filename in files: 14 for filename in files:
19 if filename.endswith(".ts"): 15 if filename.endswith(".ts"):
28 elif filename.endswith(".mkv"): 24 elif filename.endswith(".mkv"):
29 title = filename[:-4].lower() 25 title = filename[:-4].lower()
30 FILELIST.append([title, filename, root]) 26 FILELIST.append([title, filename, root])
31 print("%i files found, running duplicate testing loop" % len(FILELIST)) 27 print("%i files found, running duplicate testing loop" % len(FILELIST))
32 28
33 29 listlen = len(FILELIST)
34 for idx, item in enumerate(FILELIST): 30 for idx in range(listlen):
35 comparelist = FILELIST[idx+1:] 31 if not FILELIST[idx]:
36 print("%d %s\033[K\r" % (idx, item[0]), 32 continue
33 print("\r%d %s\033[K" % (idx, FILELIST[idx][0]),
37 end='') 34 end='')
38 sys.stdout.flush() 35 sys.stdout.flush()
39 for idx2, item2 in enumerate(comparelist): 36 for idx2 in range(idx + 1, listlen):
40 if similarity(item[0], item2[0]) > 0.85: 37 if FILELIST[idx2] and difflib.SequenceMatcher(a = FILELIST[idx][0], b = FILELIST[idx2][0]).ratio() > 0.85:
41 #print "possible duplicate %d %s" % (idx2, item2[0]) 38 #print "possible duplicate %d %s" % (idx2, item2[0])
42 key = os.path.join(item[2], item[1]) 39 key = os.path.join(FILELIST[idx][2], FILELIST[idx][1])
43 if not key in DUPLICATES.keys(): 40 if not key in DUPLICATES.keys():
44 DUPLICATES[key] = [] 41 DUPLICATES[key] = []
45 DUPLICATES[key].append( 42 DUPLICATES[key].append(
46 os.path.join(item2[2], item2[1])) 43 os.path.join(FILELIST[idx2][2], FILELIST[idx2][1]))
44 # unset the found duplicate, so that this will not be scanned again
45 FILELIST[idx2] = None
47 46
48 print("\n\n\n") 47 print("\n\n\n")
49 idx = 1 48 idx = 1
50 for base in DUPLICATES.keys(): 49 for base in DUPLICATES.keys():
51 print("Duplicate file set #%i" % idx) 50 print("Duplicate file set #%i" % idx)

mercurial