dupecheck.py

Sat, 25 Nov 2017 16:51:08 +0100

author
mdd
date
Sat, 25 Nov 2017 16:51:08 +0100
changeset 14
b398ae388328
parent 5
51e57e9f8db1
child 15
82361ad7b3fe
permissions
-rwxr-xr-x

added support for dvb_teletext subtitles

3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
1 #!/usr/bin/env python
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
2
4
a7e9e7974c22 prepare for speedup
mdd
parents: 3
diff changeset
3 from __future__ import print_function
3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
4 import difflib
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
5 import os, sys
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
6
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
# Directory tree that is scanned for recordings.
BASEDIR = "../DREAMBOX"

# Two titles whose difflib similarity ratio exceeds this are treated as duplicates.
SIMILARITY_THRESHOLD = 0.85

# [title, filename, root] entries for every recognised file; entries that were
# matched as a duplicate of an earlier file are replaced by None during the scan.
FILELIST = []
# Maps the path of the first file of a duplicate set to the paths of its duplicates.
DUPLICATES = {}


def extract_title(filename):
    """Return the lower-cased comparison title for *filename*.

    ``.ts`` names are split on " - "; when there are at least three parts the
    first two are dropped (presumably a date and a channel prefix -- confirm
    against the recorder's naming scheme) and the rest is the title.  Names
    with fewer parts, and names whose remainder would be empty, fall back to
    the whole name without extension, so that unrelated files do not all
    collapse to an empty title and match each other.

    ``.mkv`` names use the whole name without extension.

    Returns None for any other extension.
    """
    if filename.endswith(".ts"):
        stem = filename[:-3]
        parts = stem.split(" - ")
        title = " - ".join(parts[2:]) if len(parts) > 2 else stem
        return (title or stem).lower()
    if filename.endswith(".mkv"):
        return filename[:-4].lower()
    return None


def collect_files(basedir):
    """Walk *basedir* and return a list of [title, filename, root] entries.

    Files with an unsupported extension or an empty title are skipped, since
    an empty title cannot be compared meaningfully.
    """
    entries = []
    for root, _subdirs, files in os.walk(basedir):
        for filename in files:
            title = extract_title(filename)
            if title:
                entries.append([title, filename, root])
    return entries


def find_duplicates(filelist, threshold=SIMILARITY_THRESHOLD,
                    show_progress=False):
    """Group entries of *filelist* whose titles are similar.

    Every entry is compared with all later entries using
    ``difflib.SequenceMatcher``; a later entry whose ratio is above
    *threshold* is recorded as a duplicate of the earlier one.

    *filelist* is modified in place: matched later entries are set to None so
    they are neither compared again nor start a duplicate set of their own.

    Returns a dict mapping the path of the first file of each set to the list
    of paths of its duplicates.
    """
    duplicates = {}
    total = len(filelist)
    for idx in range(total):
        entry = filelist[idx]
        if not entry:
            continue
        title, filename, root = entry
        if show_progress:
            # "\r" returns to the line start and "\033[K" erases the rest of
            # the line, so the progress display stays on a single line.
            print("\r%d %s\033[K" % (idx, title), end='')
            sys.stdout.flush()
        for idx2 in range(idx + 1, total):
            candidate = filelist[idx2]
            if not candidate:
                continue
            # Argument order is kept as (base, candidate): ratio() may depend
            # on the order of its arguments.
            matcher = difflib.SequenceMatcher(a=title, b=candidate[0])
            if matcher.ratio() > threshold:
                key = os.path.join(root, filename)
                duplicates.setdefault(key, []).append(
                    os.path.join(candidate[2], candidate[1]))
                # unset the found duplicate, so that this will not be scanned again
                filelist[idx2] = None
    return duplicates


print("Reading files...")
FILELIST.extend(collect_files(BASEDIR))
print("%i files found, running duplicate testing loop" % len(FILELIST))

DUPLICATES.update(find_duplicates(FILELIST, show_progress=True))

print("\n\n\n")
for set_number, (base, dups) in enumerate(DUPLICATES.items(), 1):
    print("Duplicate file set #%i" % set_number)
    print(base)
    for dup in dups:
        print(dup)
    print()

mercurial