dupecheck.py

Tue, 28 Nov 2017 22:27:01 +0100

author
mdd
date
Tue, 28 Nov 2017 22:27:01 +0100
changeset 17
842120f00078
parent 15
82361ad7b3fe
child 21
1c0beeca2f9c
permissions
-rwxr-xr-x

code cleanup

3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
1 #!/usr/bin/env python
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
2
4
a7e9e7974c22 prepare for speedup
mdd
parents: 3
diff changeset
3 from __future__ import print_function
3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
4 import difflib
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
5 import os, sys
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
6
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
# Root directory that is walked recursively for media files.
BASEDIR = "../DREAMBOX"

# Two titles whose difflib.SequenceMatcher ratio is strictly above this value
# are reported as possible duplicates.
SIMILARITY_THRESHOLD = 0.85

# Separator between the fields of a ".ts" file name.
TS_FIELD_SEPARATOR = " - "

# Plain container formats: the title is simply the file name without suffix.
PLAIN_EXTENSIONS = (".mkv", ".mp4")

# Scanned files as [title, filename, directory] entries.  While searching,
# an entry that was matched as a duplicate of an earlier one is replaced by
# None so that it is never compared again.
FILELIST = []
# Maps the path of the first file of a duplicate set to the list of paths
# that were found to be similar to it.
DUPLICATES = {}


def extract_title(filename):
    """Return the lower-cased title used for comparison, or None.

    For ".ts" files the name is split on " - ".  When there are at least
    three fields, the first two are dropped and the rest is the title
    (presumably the leading fields are timestamp and channel -- verify
    against the recorder's naming scheme).  With fewer than three fields the
    whole name without suffix is used; previously a two-field name produced
    an empty title, and all empty titles matched each other as duplicates.

    For ".mkv" and ".mp4" files the title is the name without suffix.
    Any other file name yields None.  Suffix matching is case-sensitive.
    """
    if filename.endswith(".ts"):
        stem = filename[:-3]
        fields = stem.split(TS_FIELD_SEPARATOR)
        if len(fields) >= 3:
            return TS_FIELD_SEPARATOR.join(fields[2:]).lower()
        return stem.lower()
    if filename.endswith(PLAIN_EXTENSIONS):
        return filename[:-4].lower()
    return None


def scan_files(basedir):
    """Return [title, filename, directory] entries for all media files
    found below *basedir*, in os.walk() order."""
    entries = []
    for root, _subdirs, files in os.walk(basedir):
        for filename in files:
            title = extract_title(filename)
            if title is not None:
                entries.append([title, filename, root])
    return entries


def find_duplicates(entries, threshold=SIMILARITY_THRESHOLD,
                    show_progress=True):
    """Group entries whose titles are similar and return the groups.

    *entries* is a list of [title, filename, directory] items.  Every entry
    is compared with all later entries; a later entry whose similarity ratio
    exceeds *threshold* is recorded as a duplicate of the earlier one and is
    then set to None in *entries* (the list is modified in place) so it is
    skipped for the remainder of the search.

    Returns a dict mapping the path of the first file of each set to the
    list of paths of its duplicates.  When *show_progress* is true, the index
    and title currently being processed are written to stdout on one
    continuously rewritten line.
    """
    duplicates = {}
    total = len(entries)
    for idx in range(total):
        entry = entries[idx]
        if not entry:
            # Already reported as a duplicate of an earlier entry.
            continue
        if show_progress:
            # "\r" returns to the start of the line, "\033[K" clears the
            # remainder left over from a longer previous title.
            print("\r%d %s\033[K" % (idx, entry[0]), end='')
            sys.stdout.flush()
        for idx2 in range(idx + 1, total):
            other = entries[idx2]
            if not other:
                continue
            # Keep the (a=earlier, b=later) argument order: the result of
            # ratio() may depend on which sequence is passed first.
            matcher = difflib.SequenceMatcher(a=entry[0], b=other[0])
            if matcher.ratio() > threshold:
                key = os.path.join(entry[2], entry[1])
                duplicates.setdefault(key, []).append(
                    os.path.join(other[2], other[1]))
                # Unset the found duplicate so it will not be scanned again.
                entries[idx2] = None
    return duplicates


def print_report(duplicates):
    """Print every duplicate set, numbered from 1, followed by a blank line."""
    for number, (base, dups) in enumerate(duplicates.items(), 1):
        print("Duplicate file set #%i" % number)
        print(base)
        for dup in dups:
            print(dup)
        print()


print("Reading files...")
FILELIST.extend(scan_files(BASEDIR))
print("%i files found, running duplicate testing loop" % len(FILELIST))

DUPLICATES.update(find_duplicates(FILELIST))

print("\n\n\n")
print_report(DUPLICATES)

mercurial