6 |
6 |
7 BASEDIR="../DREAMBOX" |
7 BASEDIR="../DREAMBOX" |
8 |
8 |
9 FILELIST=[] |
9 FILELIST=[] |
10 DUPLICATES={} |
10 DUPLICATES={} |
11 |
|
12 def similarity(seq1, seq2): |
|
13 #return difflib.SequenceMatcher(a=seq1.lower(), b=seq2.lower()).ratio() |
|
14 return difflib.SequenceMatcher(a=seq1, b=seq2).ratio() |
|
15 |
11 |
16 print("Reading files...") |
12 print("Reading files...") |
17 for root, subdirs, files in os.walk(BASEDIR): |
13 for root, subdirs, files in os.walk(BASEDIR): |
18 for filename in files: |
14 for filename in files: |
19 if filename.endswith(".ts"): |
15 if filename.endswith(".ts"): |
28 elif filename.endswith(".mkv"): |
24 elif filename.endswith(".mkv"): |
29 title = filename[:-4].lower() |
25 title = filename[:-4].lower() |
30 FILELIST.append([title, filename, root]) |
26 FILELIST.append([title, filename, root]) |
31 print("%i files found, running duplicate testing loop" % len(FILELIST)) |
27 print("%i files found, running duplicate testing loop" % len(FILELIST)) |
32 |
28 |
33 |
29 listlen = len(FILELIST) |
34 for idx, item in enumerate(FILELIST): |
30 for idx in range(listlen): |
35 comparelist = FILELIST[idx+1:] |
31 if not FILELIST[idx]: |
36 print("%d %s\033[K\r" % (idx, item[0]), |
32 continue |
|
33 print("\r%d %s\033[K" % (idx, FILELIST[idx][0]), |
37 end='') |
34 end='') |
38 sys.stdout.flush() |
35 sys.stdout.flush() |
39 for idx2, item2 in enumerate(comparelist): |
36 for idx2 in range(idx + 1, listlen): |
40 if similarity(item[0], item2[0]) > 0.85: |
37 if FILELIST[idx2] and difflib.SequenceMatcher(a = FILELIST[idx][0], b = FILELIST[idx2][0]).ratio() > 0.85: |
41 #print "possible duplicate %d %s" % (idx2, item2[0]) |
38 #print "possible duplicate %d %s" % (idx2, item2[0]) |
42 key = os.path.join(item[2], item[1]) |
39 key = os.path.join(FILELIST[idx][2], FILELIST[idx][1]) |
43 if not key in DUPLICATES.keys(): |
40 if not key in DUPLICATES.keys(): |
44 DUPLICATES[key] = [] |
41 DUPLICATES[key] = [] |
45 DUPLICATES[key].append( |
42 DUPLICATES[key].append( |
46 os.path.join(item2[2], item2[1])) |
43 os.path.join(FILELIST[idx2][2], FILELIST[idx2][1])) |
|
44 # unset the found duplicate, so that this will not be scanned again |
|
45 FILELIST[idx2] = None |
47 |
46 |
48 print("\n\n\n") |
47 print("\n\n\n") |
49 idx = 1 |
48 idx = 1 |
50 for base in DUPLICATES.keys(): |
49 for base in DUPLICATES.keys(): |
51 print("Duplicate file set #%i" % idx) |
50 print("Duplicate file set #%i" % idx) |