dupecheck.py

Mon, 09 Mar 2020 12:19:29 +0100

author
mdd
date
Mon, 09 Mar 2020 12:19:29 +0100
changeset 37
5be334b71b08
parent 36
a1ad6f4728be
permissions
-rw-r--r--

optimized fixnames for kodi

3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
1 #!/usr/bin/env python
33
83bcb5931ee3 function to fix weird filenames
mdd
parents: 32
diff changeset
2 # -*- coding: utf-8 -*-
21
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
3 """
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
4 Toolkit / executable to scan for duplicate filenames in movie database
37
5be334b71b08 optimized fixnames for kodi
mdd
parents: 36
diff changeset
5 More functions:
5be334b71b08 optimized fixnames for kodi
mdd
parents: 36
diff changeset
6 * sanitize filenames
5be334b71b08 optimized fixnames for kodi
mdd
parents: 36
diff changeset
7 * statistics
21
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
8
37
5be334b71b08 optimized fixnames for kodi
mdd
parents: 36
diff changeset
9 2017-2020 by mdd
21
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
10 """
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
11
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
12 #pylint: disable=line-too-long
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
13 #pylint: disable=invalid-name
3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
14
4
a7e9e7974c22 prepare for speedup
mdd
parents: 3
diff changeset
15 from __future__ import print_function
35
14c966c10648 added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents: 33
diff changeset
16 import os, sys, re
36
a1ad6f4728be added support for remote ssh dupe checking against local basedir
mdd
parents: 35
diff changeset
17 import time
35
14c966c10648 added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents: 33
diff changeset
18
14c966c10648 added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents: 33
diff changeset
# Matches one bracketed group, "(...)" or "[...]", as short as possible
# (non-greedy). The opening and closing bracket types are not required to
# agree, so "(abc]" matches as well. Written as a raw string so that the
# backslashes reach the regex engine untouched.
RE_PARENTHESES = re.compile(r"[\(\[].*?[\)\]]")
3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
20
22
c18abd9198c0 implemented Levenshtein algorithm for incredible speedup
mdd
parents: 21
diff changeset
def similarity(a, b):
    """
    Return the similarity ratio of the two strings a and b.

    The backend is selected by the module-level DIFFLIB flag, which is set
    outside of this function: difflib.SequenceMatcher when the flag is
    true, Levenshtein.ratio otherwise.
    """
    if not DIFFLIB:
        return Levenshtein.ratio(a, b)
    matcher = difflib.SequenceMatcher(a=a, b=b)
    return matcher.ratio()
c18abd9198c0 implemented Levenshtein algorithm for incredible speedup
mdd
parents: 21
diff changeset
26
32
df89a8fba2a2 added stats calc to dupechecker
mdd
parents: 22
diff changeset
# unit labels used by humansize(), one per power of 1024
suffixes = ['b', 'K', 'M', 'G', 'T', 'P']


def humansize(nbytes):
    """
    Format a byte count as a short string such as "1.5 K" or "10 M".

    The value is divided by 1024 until it drops below 1024 or the largest
    unit is reached. It is printed with at most two decimals; trailing
    zeros and a trailing decimal point are removed.
    """
    last = len(suffixes) - 1
    unit = 0
    while unit < last and nbytes >= 1024:
        nbytes /= 1024.
        unit += 1
    number = '%.2f' % nbytes
    number = number.rstrip('0')
    number = number.rstrip('.')
    return '%s %s' % (number, suffixes[unit])
df89a8fba2a2 added stats calc to dupechecker
mdd
parents: 22
diff changeset
35
33
83bcb5931ee3 function to fix weird filenames
mdd
parents: 32
diff changeset
def replace_all(text, dic):
    """
    Apply every replacement listed in dic to text and return the result.

    text: the string to work on
    dic:  mapping of search string -> replacement string

    The pairs are applied one after another in the iteration order of dic,
    so a later pair also sees the output of the earlier ones.
    """
    # items() is available on Python 2 and 3, iteritems() only on Python 2
    for search, replacement in dic.items():
        text = text.replace(search, replacement)
    return text
83bcb5931ee3 function to fix weird filenames
mdd
parents: 32
diff changeset
40
21
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
41 class dupechecker(object):
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
42 """
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
43 Simple class to scan multiple directories recursive,
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
44 build a list of movie filenames.
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
45 analyze the list for duplicates and dump them
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
46 """
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
def __init__(self):
    """
    Create an empty checker with default settings
    """
    # ssh client and the host entry currently in use; both are filled in
    # by scandir_remote
    self.ssh = self.ssh_data = None

    # titles starting with one of these prefixes are dropped by analyze
    self.ignore_fileprefix = []
    # two titles count as duplicates when their similarity exceeds this
    self.ratio = 0.85

    # result of analyze: path of a file -> list of paths of its duplicates
    self.duplicates = {}
    # scanned files, one [title, filename, root, ext] list per file
    self.filelist = []
    # directory handed to the most recent scandir call
    self.basedir = ""
3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
55
22
c18abd9198c0 implemented Levenshtein algorithm for incredible speedup
mdd
parents: 21
diff changeset
56
21
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
def reset(self):
    """
    Forget all scanned files and all duplicates found so far.
    Settings such as ratio and ignore_fileprefix are left untouched.
    """
    self.duplicates = {}
    self.filelist = []
3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
60
36
a1ad6f4728be added support for remote ssh dupe checking against local basedir
mdd
parents: 35
diff changeset
def __scandir_files(self, root, files, extra=None):
    """
    Pick the media files out of files and append them to self.filelist.

    root:  location of the files; a directory for local scans, a
           "host: basedir" label for remote listings
    files: plain filenames without directory part
    extra: optional additional extensions to accept; they are compared
           against the lowercased extension including its leading dot

    Each accepted file is stored as [title, filename, root, ext]. title is
    the lowercased name without extension and without bracketed parts,
    ext is the lowercased extension.
    """
    media_exts = ('.mkv', '.avi', '.mpg', '.mpeg', '.mp4', '.ogg', '.mp3', '.iso')
    if extra is None:
        extra = ()

    for filename in files:
        # splitext copes with extensions of any length ('.mpeg', user
        # supplied ones); slicing a fixed number of characters does not
        stem, ext = os.path.splitext(filename)
        ext = ext.lower()

        if ext == ".ts":
            # names of the form "<a> - <b> - <title>.ts": only the part
            # after the second " - " is used as title
            parts = stem.split(" - ")
            if len(parts) == 1:
                title = parts[0]
            else:
                # NOTE(review): a name with exactly one " - " ends up with
                # an empty title, as before -- confirm this is intended
                title = " - ".join(parts[2:])
        elif ext in media_exts or ext in extra:
            title = stem
        else:
            continue

        # remove parentheses with contents in title
        title = RE_PARENTHESES.sub("", title.lower())
        self.filelist.append([title, filename, root, ext])
a1ad6f4728be added support for remote ssh dupe checking against local basedir
mdd
parents: 35
diff changeset
85
a1ad6f4728be added support for remote ssh dupe checking against local basedir
mdd
parents: 35
diff changeset
86
33
83bcb5931ee3 function to fix weird filenames
mdd
parents: 32
diff changeset
def scandir(self, basedir, extra=None):
    """
    Scan a base directory recursively for movie files and add them to
    the list for analysis.

    basedir: directory to walk; it is also stored in self.basedir
    extra:   optional list of additional file extensions to accept, handed
             on unchanged to the per-directory file filter
    """
    if extra is None:
        extra = []
    self.basedir = basedir
    print("Scanning directory: %s" % basedir)
    for root, _subdirs, files in os.walk(basedir):
        self.__scandir_files(root, files, extra)
a1ad6f4728be added support for remote ssh dupe checking against local basedir
mdd
parents: 35
diff changeset
98
a1ad6f4728be added support for remote ssh dupe checking against local basedir
mdd
parents: 35
diff changeset
def scandir_remote(self, extra=None):
    """
    Connect to the remote ssh servers and add their *.ts files to the
    list for the duplicate check.

    The hosts are taken from REMOTE_HOSTS in config.py. Every entry is
    used as a mapping with the keys 'host' and 'basedir' here and with
    'port', 'user', 'pass' and 'key' when the connection is opened.

    extra: optional list of additional file extensions, handed on to the
           file filter. Only *.ts files are listed on the remote side, so
           it has no visible effect at the moment.

    Exits the program with status 1 when config.py or Paramiko cannot be
    imported.
    """
    print("getting filelist from remote hosts...")
    try:
        from config import REMOTE_HOSTS
    except ImportError:
        print("Please configure REMOTE_HOSTS in config.py!")
        sys.exit(1)
    try:
        import paramiko
    except ImportError:
        print("Please install Paramiko!")
        sys.exit(1)

    if extra is None:
        extra = []

    self.ssh = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy trusts unknown host keys without asking;
    # paramiko.WarningPolicy() would at least report them
    self.ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())

    for host in REMOTE_HOSTS:
        self.ssh_data = host
        try:
            # NOTE(review): basedir is inserted into the shell command
            # unquoted; a path with spaces will not work
            result = self.__ssh_exec('cd %s; ls -1 *.ts' % self.ssh_data['basedir'])
            lines = result[0] if result else []

            cleanlist = []
            for item in lines:
                # non-ASCII characters are dropped from the name
                name = item.strip().encode('ascii', 'ignore')
                if not isinstance(name, str):
                    # Python 3: encode() returned bytes, but the file
                    # filter compares against str extensions
                    name = name.decode('ascii')
                cleanlist.append(name)

            self.__scandir_files("%s: %s" % (
                self.ssh_data['host'], self.ssh_data['basedir']), cleanlist, extra)
        finally:
            # disconnect after every host: an active connection would
            # otherwise be reused for the next host's command
            self.__ssh_disconnect()
35
14c966c10648 added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents: 33
diff changeset
130
36
a1ad6f4728be added support for remote ssh dupe checking against local basedir
mdd
parents: 35
diff changeset
def __ssh_exec(self, command):
    """
    Run command on the host described by self.ssh_data.

    A connection is opened first when the client has no active transport.
    It stays open afterwards, so following commands reuse it until
    ssh_disconnect is called.

    Returns None when no ssh client has been created yet, otherwise the
    tuple (stdout lines, stderr lines).
    """
    client = self.ssh
    if client is None:
        return None

    transport = client.get_transport()
    if not (transport and transport.is_active()):
        target = self.ssh_data
        print("SSH: connecting to %s" % target['host'])
        client.connect(target['host'], target['port'], target['user'], target['pass'], target['key'])

    # start the command; this call does not wait for it to finish
    _stdin, out, err = client.exec_command(command)

    # poll once per second until the command has exited or produced output
    channel = out.channel
    while not (channel.exit_status_ready() or channel.recv_ready()):
        time.sleep(1)

    out_lines = out.readlines()
    err_lines = err.readlines()
    return out_lines, err_lines
a1ad6f4728be added support for remote ssh dupe checking against local basedir
mdd
parents: 35
diff changeset
156
a1ad6f4728be added support for remote ssh dupe checking against local basedir
mdd
parents: 35
diff changeset
def __ssh_disconnect(self):
    """
    Close the ssh connection if one is currently open.

    Does nothing when no ssh client exists or when the client has no
    active transport.
    """
    if self.ssh is None:
        return
    transport = self.ssh.get_transport()
    # close only a live connection; the previous check was inverted and
    # skipped exactly the connections that needed closing
    if transport is not None and transport.is_active():
        print("SSH: disconnecting")
        self.ssh.close()
33
83bcb5931ee3 function to fix weird filenames
mdd
parents: 32
diff changeset
167
83bcb5931ee3 function to fix weird filenames
mdd
parents: 32
diff changeset
def fixnames(self):
    """
    Search for defective filenames, remove illegal characters and rename
    the files on disk.

    Only entries of self.filelist with the extension .mkv, .txt or .nfo
    are handled. A name is cleaned in these steps:

    1. 'ß' becomes 'ss', ':' becomes ' -'
    2. every character outside the allowed set becomes '-'
       (umlauts are allowed and stay as they are)
    3. "(...)" groups that are not a four digit year become "[...]"
    4. a "[...]" group starting with four digits gets the year split off:
       "[2019 x]" becomes "(2019) [ x]"

    A file is never renamed onto an existing file, and after a rename the
    entry in self.filelist carries the new name.
    """
    for item in self.filelist:
        if item[3] not in ('.mkv', '.txt', '.nfo'):
            continue

        oldfn = item[1]
        cleanfn = oldfn.replace('ß', 'ss').replace(':', ' -')
        # any character outside the allowed set is replaced by a dash
        cleanfn = re.sub(r'[^A-Za-z0-9\.\_\-\(\)\[\]\{\}\&öäüÖÄÜ\' ]', '-', cleanfn)

        # NOTE: .nfo files keep their extension

        # round brackets are kept for the year only
        for nonyear in re.findall(r'\([^\(\)]+\)', cleanfn):
            if re.match(r'\(\d{4}\)', nonyear):
                continue
            cleanfn = cleanfn.replace(
                nonyear, nonyear.replace('(', '[').replace(')', ']'))

        # "[2019 ...]" -> "(2019) [ ...]"
        for year in re.findall(r'\[\d{4}[^\]]+\]', cleanfn):
            cleanfn = cleanfn.replace(
                year, year.replace(year[:5], '(' + year[1:5] + ') ['))

        if oldfn == cleanfn:
            continue

        source = os.path.join(item[2], oldfn)
        target = os.path.join(item[2], cleanfn)
        if os.path.exists(target):
            # os.rename may silently replace the target; never lose a file
            print (oldfn, " -> ", cleanfn, " skipped, target exists")
            continue

        print (oldfn, " -> ", cleanfn)
        os.rename(source, target)
        # keep the list in sync so later steps find the file
        item[1] = cleanfn
32
df89a8fba2a2 added stats calc to dupechecker
mdd
parents: 22
diff changeset
217
df89a8fba2a2 added stats calc to dupechecker
mdd
parents: 22
diff changeset
def statistics(self):
    """
    Summarize disk usage and print stats about the found filetypes.

    Prints one line per file extension with the number of files and their
    total size, followed by a TOTAL line. Entries whose size cannot be
    read (for example entries of a remote listing, or files that were
    renamed after the scan) are counted with size 0 and reported in a
    final note.
    """
    stats = {}
    unreadable = 0
    for item in self.filelist:
        entry = stats.setdefault(item[3], [0, 0.0])
        entry[0] += 1
        try:
            entry[1] += os.stat(os.path.join(item[2], item[1])).st_size
        except OSError:
            # not a readable local file; keep the count, skip the size
            unreadable += 1

    print ("%5s %6s %10s" % (
        "File:",
        "Count:",
        "Size:"))
    sum_count = 0
    sum_size = 0.0
    # sorted for a stable output order
    for ext in sorted(stats):
        count, size = stats[ext]
        sum_count += count
        sum_size += size
        print ("%5s %6i %10s" % (
            ext, count,
            humansize(size)))
    print ("%5s %6i %10s" % (
        "TOTAL", sum_count,
        humansize(sum_size)))
    if unreadable:
        print ("%i file(s) could not be read and are counted with size 0" % unreadable)
32
df89a8fba2a2 added stats calc to dupechecker
mdd
parents: 22
diff changeset
245
21
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
246
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
247 def analyze(self):
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
248 """
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
249 Analyze the scanlist for duplicates
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
250 """
35
14c966c10648 added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents: 33
diff changeset
251 listlen = len(self.filelist)
21
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
252 print("%i files to analyze, running duplicate testing loop..." % (
35
14c966c10648 added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents: 33
diff changeset
253 listlen))
3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
254
35
14c966c10648 added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents: 33
diff changeset
255 # remove potentially unwanted entries from the list
14c966c10648 added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents: 33
diff changeset
256 if len(self.ignore_fileprefix) > 0:
14c966c10648 added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents: 33
diff changeset
257 for idx in reversed(range(listlen)):
14c966c10648 added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents: 33
diff changeset
258 for tst in self.ignore_fileprefix:
14c966c10648 added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents: 33
diff changeset
259 if tst == '':
14c966c10648 added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents: 33
diff changeset
260 continue
14c966c10648 added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents: 33
diff changeset
261 if self.filelist[idx][0].startswith(tst):
14c966c10648 added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents: 33
diff changeset
262 del self.filelist[idx]
14c966c10648 added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents: 33
diff changeset
263 break
14c966c10648 added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents: 33
diff changeset
264 listlen = len(self.filelist)
14c966c10648 added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents: 33
diff changeset
265
21
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
266 for idx in range(listlen):
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
267 if not self.filelist[idx]:
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
268 continue
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
269 print("\r%d %s\033[K" % (
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
270 idx, self.filelist[idx][0]), end='')
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
271 sys.stdout.flush()
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
272 for idx2 in range(idx + 1, listlen):
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
273 if self.filelist[idx2]:
22
c18abd9198c0 implemented Levenshtein algorithm for incredible speedup
mdd
parents: 21
diff changeset
274 if similarity(self.filelist[idx][0], self.filelist[idx2][0]) > self.ratio:
21
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
275 #print "possible duplicate %d %s" % (idx2, item2[0])
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
276 key = os.path.join(self.filelist[idx][2], self.filelist[idx][1])
22
c18abd9198c0 implemented Levenshtein algorithm for incredible speedup
mdd
parents: 21
diff changeset
277 if not key in self.duplicates:
21
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
278 self.duplicates[key] = []
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
279 self.duplicates[key].append(
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
280 os.path.join(
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
281 self.filelist[idx2][2],
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
282 self.filelist[idx2][1]
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
283 ))
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
284 # unset the found duplicate, so that this will not be scanned again
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
285 self.filelist[idx2] = None
22
c18abd9198c0 implemented Levenshtein algorithm for incredible speedup
mdd
parents: 21
diff changeset
286 print("\n\n")
3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
287
21
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
def output(self):
    """
    Print every collected duplicate set to the console.

    Walks ``self.duplicates`` and, for each entry, prints a 1-based
    running set number, the entry's key, every path stored under that
    key (one per line) and finally an empty separator line.
    """
    for number, (first, others) in enumerate(self.duplicates.items(), 1):
        print("Duplicate file set #%i" % number)
        print(first)
        for path in others:
            print(path)
        print()
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
300
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
301
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
if __name__ == "__main__":
    # Command line entry point: parse the options, pick the string
    # similarity backend, load the optional ignore list and then run one of
    # the modes (fix file names, statistics, or duplicate analysis).
    import argparse

    parser = argparse.ArgumentParser(
        description='Movie database filename duplicate checker')
    parser.add_argument('--ratio', type=float, default=0.85,
                        help='filename duplicate threshold 0.1 < ratio 1.0 (default 0.85)')
    parser.add_argument('--difflib', action='store_true', default=False,
                        help='force the use of difflib instead Levenshtein')
    parser.add_argument('--stats', action='store_true', default=False,
                        help='generate stats summary instead of check for duplicates')
    parser.add_argument('--remote', action='store_true', default=False,
                        help='Connect to ssh remotes, eg. dupecheck for dreambox local storage')
    parser.add_argument('--fixnames', action='store_true', default=False,
                        help='scan for mkv and txt, fix broken filenames for windows')
    parser.add_argument('basedir', metavar='basedir', nargs='+',
                        help='one or more base directories')

    args = parser.parse_args()
    dupe = dupechecker()
    dupe.ratio = args.ratio

    # DIFFLIB and the imports below are deliberately bound at module level
    # (not inside a function) so they stay module globals.
    # NOTE(review): presumably read by the similarity() helper defined
    # elsewhere in this file -- confirm before moving this into a main().
    if args.difflib:
        DIFFLIB = True
        import difflib
    else:
        try:
            import Levenshtein
            DIFFLIB = False
        except ImportError:
            # fall back to the standard library when the C extension is missing
            import difflib
            DIFFLIB = True
            print("Consider 'pip install python-Levenshtein' for faster analyze")

    if os.path.isfile("dupecheck-ignore.txt"):
        # read the entire file line by line into buffer
        print("Loading ignore filename prefixes file for dupe checking...")
        # Text mode: binary mode yields bytes on Python 3, where
        # rstrip('\n') raises TypeError and the prefixes could not be
        # compared against str file names. The context manager closes
        # the handle, which the former bare open() never did.
        with open("dupecheck-ignore.txt", "r") as ignore_file:
            dupe.ignore_fileprefix = [line.rstrip('\n').rstrip('\r') for line in ignore_file]

    if args.fixnames:
        for srcstr in args.basedir:
            dupe.scandir(srcstr, ['.txt', '.nfo'])
        if dupe.filelist:
            print ("Checking %i file names..." % len(dupe.filelist))
            dupe.fixnames()
            dupe.filelist = []
        # NOTE(review): nesting reconstructed from a view that lost the
        # original indentation -- confirm that the exit applies to the whole
        # --fixnames branch and the list reset to the non-empty case only.
        sys.exit(0)

    if args.remote:
        dupe.scandir_remote()

    for srcstr in args.basedir:
        dupe.scandir(srcstr)

    if args.stats or args.fixnames:
        dupe.statistics()
    else:
        dupe.analyze()
        dupe.output()
33
83bcb5931ee3 function to fix weird filenames
mdd
parents: 32
diff changeset
361

mercurial