Mon, 09 Mar 2020 12:19:29 +0100
optimized fixnames for kodi
3 | 1 | #!/usr/bin/env python |
33 | 2 | # -*- coding: utf-8 -*- |
21 | 3 | """ |
4 | Toolkit / executable to scan for duplicate filenames in movie database | |
37 | 5 | More functions: |
6 | * sanitize filenames | |
7 | * statistics | |
21 | 8 | |
37 | 9 | 2017-2020 by mdd |
21 | 10 | """ |
11 | ||
12 | #pylint: disable=line-too-long | |
13 | #pylint: disable=invalid-name | |
3 | 14 | |
4 | 15 | from __future__ import print_function |
35
14c966c10648
added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents:
33
diff
changeset
|
16 | import os, sys, re |
36
a1ad6f4728be
added support for remote ssh dupe checking against local basedir
mdd
parents:
35
diff
changeset
|
17 | import time |
35
14c966c10648
added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents:
33
diff
changeset
|
18 | |
14c966c10648
added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents:
33
diff
changeset
|
# Matches a parenthesised or bracketed group including its contents, e.g.
# "(2010)" or "[uncut]"; non-greedy so adjacent groups are matched separately.
# Raw string: "\(" and "\[" are regex escapes, not string escapes.
RE_PARENTHESES = re.compile(r"[\(\[].*?[\)\]]")
3 | 20 | |
22
c18abd9198c0
implemented Levenshtein algorithm for incredible speedup
mdd
parents:
21
diff
changeset
|
def similarity(a, b):
    """
    Return the similarity ratio of the two strings a and b.

    The backend is selected by the module-level flag DIFFLIB, which the
    command line entry point sets: a true value uses
    difflib.SequenceMatcher, a false value uses Levenshtein.ratio.
    When DIFFLIB has not been defined (e.g. the module is imported as a
    library) the standard library difflib backend is used.
    """
    # the imports are local so the function does not depend on the entry
    # point having imported the selected backend into the module namespace
    if globals().get("DIFFLIB", True):
        import difflib
        return difflib.SequenceMatcher(a=a, b=b).ratio()
    import Levenshtein
    return Levenshtein.ratio(a, b)
c18abd9198c0
implemented Levenshtein algorithm for incredible speedup
mdd
parents:
21
diff
changeset
|
26 | |
# unit suffixes in ascending order, each step is a factor of 1024
suffixes = ['b', 'K', 'M', 'G', 'T', 'P']


def humansize(nbytes):
    """
    Format a byte count as a short human readable string, e.g. "1.5 K".

    The value is divided by 1024 until it is below 1024 or the largest
    suffix is reached. At most two decimals are shown and trailing zeros
    are stripped. Negative values are scaled by their magnitude.
    """
    i = 0
    while abs(nbytes) >= 1024 and i < len(suffixes) - 1:
        nbytes /= 1024.
        i += 1
    f = ('%.2f' % nbytes).rstrip('0').rstrip('.')
    return '%s %s' % (f, suffixes[i])
35 | ||
def replace_all(text, dic):
    """
    Return text with every key of dic replaced by its value.

    The replacements are applied one after another in the iteration order
    of dic, so a later replacement also sees the result of earlier ones.
    """
    # items() exists on Python 2 and 3, iteritems() only on Python 2
    for i, j in dic.items():
        text = text.replace(i, j)
    return text
40 | ||
class dupechecker(object):
    """
    Simple class to scan multiple directories recursively,
    build a list of movie filenames,
    analyze the list for duplicates and dump them.

    Every entry of filelist is a list [title, filename, root, ext]:
    title is the lower-cased name used for similarity matching, root is
    the directory (or a "host: basedir" label for remote entries) and
    ext is the lower-cased file extension including the dot.
    """

    # extensions that are always collected by a scan (besides ".ts")
    MEDIA_EXTENSIONS = ('.mkv', '.avi', '.mpg', '.mpeg', '.mp4', '.ogg', '.mp3', '.iso')
    # extensions that fixnames() is allowed to rename
    FIXNAME_EXTENSIONS = ('.mkv', '.txt', '.nfo')

    def __init__(self):
        self.basedir = ""
        self.filelist = []
        self.duplicates = {}
        # two titles count as duplicates when their similarity exceeds this
        self.ratio = 0.85
        # titles starting with one of these strings are skipped by analyze()
        self.ignore_fileprefix = []
        # paramiko client and the config entry of the host currently in use
        self.ssh = None
        self.ssh_data = None

    def reset(self):
        """
        Forget all scanned files and all found duplicates
        """
        self.filelist = []
        self.duplicates = {}

    def __scandir_files(self, root, files, extra=None):
        """
        Add all media files of one directory listing to the file list.

        root is stored with every entry, files is the list of file names,
        extra is an optional list of additional extensions to collect.
        """
        extra = extra or ()
        for filename in files:
            # splitext() keeps the title free of the dot for any extension
            # length (".ts", ".mkv", ".mpeg", ...)
            stem, ext = os.path.splitext(filename)
            ext = ext.lower()
            if ext == ".ts":
                # presumably recordings are named "<a> - <b> - <title>.ts":
                # the first two fields are dropped - TODO confirm the scheme
                parts = stem.split(" - ")
                if len(parts) > 2:
                    title = " - ".join(parts[2:])
                else:
                    # fewer fields than expected: keep the last one instead
                    # of an empty title, which would match every other
                    # empty title
                    title = parts[-1]
            elif ext in self.MEDIA_EXTENSIONS or ext in extra:
                title = stem
            else:
                continue

            # remove parentheses with contents in title
            title = RE_PARENTHESES.sub("", title.lower())
            self.filelist.append([title, filename, root, ext])

    def scandir(self, basedir, extra=None):
        """
        Scan a base directory for movie files and add them to
        the list for analyze.

        extra is an optional list of additional file extensions
        (lower case, with dot) to collect.
        """
        self.basedir = basedir
        print("Scanning directory: %s" % basedir)
        for root, _subdirs, files in os.walk(basedir):
            self.__scandir_files(root, files, extra)

    def scandir_remote(self, extra=None):
        """
        Connect to remote ssh servers and get file lists for duplicate check.

        The hosts are read from REMOTE_HOSTS in config.py; the process
        exits with status 1 when the config or Paramiko is missing.
        """
        print("getting filelist from remote hosts...")
        try:
            from config import REMOTE_HOSTS
        except ImportError:
            print("Please configure REMOTE_HOSTS in config.py!")
            sys.exit(1)
        try:
            import paramiko
        except ImportError:
            print("Please install Paramiko!")
            sys.exit(1)
        self.ssh = paramiko.SSHClient()
        # NOTE(review): AutoAddPolicy accepts unknown host keys without
        # verification - consider WarningPolicy or known_hosts
        self.ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())

        for host in REMOTE_HOSTS:
            self.ssh_data = host

            # NOTE(review): basedir is interpolated into a shell command,
            # it must come from trusted configuration only
            result = self.__ssh_exec('cd %s; ls -1 *.ts' % self.ssh_data['basedir'])
            if result is None:
                continue
            cleanlist = []
            for item in result[0]:
                name = item.strip().encode('ascii', 'ignore')
                if not isinstance(name, str):
                    # Python 3: encode() returns bytes, the scan needs text
                    name = name.decode('ascii')
                cleanlist.append(name)
            self.__scandir_files("%s: %s" % (
                self.ssh_data['host'], self.ssh_data['basedir']), cleanlist, extra)
            # close the connection so the next host gets a fresh one
            self.__ssh_disconnect()

    def __ssh_exec(self, command):
        """
        Establish ssh connection and execute command.

        The connection remains open for following commands until
        ssh_disconnect is called. Returns the tuple (stdout lines,
        stderr lines) or None when no ssh client has been set up.
        """
        if self.ssh is None:
            return None
        transport = self.ssh.get_transport()
        if not transport or not transport.is_active():
            print("SSH: connecting to %s" % self.ssh_data['host'])
            self.ssh.connect(
                self.ssh_data['host'],
                port=self.ssh_data['port'],
                username=self.ssh_data['user'],
                password=self.ssh_data['pass'],
                pkey=self.ssh_data['key'])

        # Send the command (non-blocking)
        _stdin, stdout, stderr = self.ssh.exec_command(command)

        # Wait until the command terminated or output is available
        while not stdout.channel.exit_status_ready() and not stdout.channel.recv_ready():
            time.sleep(1)

        return stdout.readlines(), stderr.readlines()

    def __ssh_disconnect(self):
        """
        Check if ssh is connected and disconnect
        """
        if self.ssh is None:
            return
        transport = self.ssh.get_transport()
        # only an active transport needs to be closed
        if transport and transport.is_active():
            print("SSH: disconnecting")
            self.ssh.close()

    @staticmethod
    def _clean_filename(name):
        """
        Return name with illegal characters replaced and the bracket
        style normalized: "(text)" becomes "[text]", "(1999)" is kept and
        "[1999 text]" becomes "(1999) [ text]".
        """
        cleanfn = name.replace('ß', 'ss').replace(':', ' -')
        # every character outside of the whitelist becomes a dash
        cleanfn = re.sub(r'[^A-Za-z0-9\.\_\-\(\)\[\]\{\}\&öäüÖÄÜ\' ]', '-', cleanfn)

        # parentheses are reserved for the year, anything else gets brackets
        for group in re.findall(r'\([^\(\)]+\)', cleanfn):
            if re.match(r'\(\d{4}\)', group):
                continue
            cleanfn = cleanfn.replace(
                group, group.replace('(', '[').replace(')', ']'))

        # a bracket group starting with a year: split the year off
        for group in re.findall(r'\[\d{4}[^\]]+\]', cleanfn):
            cleanfn = cleanfn.replace(
                group, group.replace(group[:5], '(' + group[1:5] + ') ['))
        return cleanfn

    def fixnames(self):
        """
        Search for defect filenames and remove illegal characters.

        Files are renamed on disk; a rename is skipped when a file with
        the cleaned name already exists, so nothing gets overwritten.
        """
        for item in self.filelist:
            if item[3] not in self.FIXNAME_EXTENSIONS:
                continue
            cleanfn = self._clean_filename(item[1])
            if item[1] == cleanfn:
                continue
            source = os.path.join(item[2], item[1])
            target = os.path.join(item[2], cleanfn)
            if os.path.exists(target):
                print("SKIPPED (target exists):", item[1], " -> ", cleanfn)
                continue
            print(item[1], " -> ", cleanfn)
            os.rename(source, target)
            # keep the list entry in sync with the file on disk
            item[1] = cleanfn

    def statistics(self):
        """
        Summarize disk usage and print stats about found filetypes
        """
        stats = {}
        unreadable = 0
        for item in self.filelist:
            entry = stats.setdefault(item[3], [0, 0.0])
            entry[0] += 1
            try:
                entry[1] += os.stat(os.path.join(item[2], item[1])).st_size
            except OSError:
                # e.g. entries without a local path; counted with size 0
                unreadable += 1
        print("%5s %6s %10s" % (
            "File:",
            "Count:",
            "Size:"))
        sum_count = 0
        sum_size = 0.0
        for ext in sorted(stats):
            count, size = stats[ext]
            sum_count += count
            sum_size += size
            print("%5s %6i %10s" % (ext, count, humansize(size)))
        print("%5s %6i %10s" % (
            "TOTAL", sum_count,
            humansize(sum_size)))
        if unreadable:
            print("%i files could not be accessed and are counted with size 0" % unreadable)

    def analyze(self):
        """
        Analyze the scanlist for duplicates.

        Entries whose title starts with one of ignore_fileprefix are
        removed from filelist first. The result is stored in duplicates:
        path of the first file of a set -> list of paths similar to it.
        """
        print("%i files to analyze, running duplicate testing loop..." % (
            len(self.filelist)))

        # remove potentially unwanted entries from the list; titles are
        # lower case, so the prefixes are compared in lower case as well
        prefixes = tuple(tst.lower() for tst in self.ignore_fileprefix if tst)
        if prefixes:
            self.filelist[:] = [
                entry for entry in self.filelist
                if not (entry and entry[0].startswith(prefixes))]

        entries = self.filelist
        listlen = len(entries)
        # consumed[i] is set once entry i is empty or was assigned to a
        # duplicate set, so it is not compared again
        consumed = [not entry for entry in entries]
        for idx in range(listlen):
            if consumed[idx]:
                continue
            title = entries[idx][0]
            # progress output on a single console line
            print("\r%d %s\033[K" % (idx, title), end='')
            sys.stdout.flush()
            for idx2 in range(idx + 1, listlen):
                if consumed[idx2]:
                    continue
                if similarity(title, entries[idx2][0]) > self.ratio:
                    key = os.path.join(entries[idx][2], entries[idx][1])
                    self.duplicates.setdefault(key, []).append(
                        os.path.join(entries[idx2][2], entries[idx2][1]))
                    consumed[idx2] = True
        print("\n\n")

    def output(self):
        """
        Dump found duplicates to console
        """
        for idx, base in enumerate(self.duplicates, 1):
            print("Duplicate file set #%i" % idx)
            print(base)
            for dup in self.duplicates[base]:
                print(dup)
            print()
300 | ||
301 | ||
if __name__ == "__main__":
    # parse command line options
    import argparse

    parser = argparse.ArgumentParser(
        description='Movie database filename duplicate checker')
    parser.add_argument('--ratio', type=float, default=0.85,
                        help='filename duplicate threshold 0.1 < ratio < 1.0 (default 0.85)')
    parser.add_argument('--difflib', action='store_true', default=False,
                        help='force the use of difflib instead Levenshtein')
    parser.add_argument('--stats', action='store_true', default=False,
                        help='generate stats summary instead of check for duplicates')
    parser.add_argument('--remote', action='store_true', default=False,
                        help='Connect to ssh remotes, eg. dupecheck for dreambox local storage')
    parser.add_argument('--fixnames', action='store_true', default=False,
                        help='scan for mkv and txt, fix broken filenames for windows')
    parser.add_argument('basedir', metavar='basedir', nargs='+',
                        help='one or more base directories')

    args = parser.parse_args()
    dupe = dupechecker()
    dupe.ratio = args.ratio

    # select the similarity backend; DIFFLIB is read by similarity()
    if args.difflib:
        DIFFLIB = True
        import difflib
    else:
        try:
            import Levenshtein
            DIFFLIB = False
        except ImportError:
            import difflib
            DIFFLIB = True
            print("Consider 'pip install python-Levenshtein' for faster analyze")

    if os.path.isfile("dupecheck-ignore.txt"):
        # read the entire file line by line into buffer
        print("Loading ignore filename prefixes file for dupe checking...")
        # text mode: the prefixes are compared against text titles;
        # the with block closes the file again
        with open("dupecheck-ignore.txt", "r") as ignorefile:
            dupe.ignore_fileprefix = [line.rstrip('\r\n') for line in ignorefile]

    if args.fixnames:
        # fix the names of one base directory at a time, then stop
        for srcstr in args.basedir:
            dupe.scandir(srcstr, ['.txt', '.nfo'])
            if len(dupe.filelist) > 0:
                print("Checking %i file names..." % len(dupe.filelist))
                dupe.fixnames()
            dupe.filelist = []
        sys.exit(0)

    if args.remote:
        dupe.scandir_remote()

    for srcstr in args.basedir:
        dupe.scandir(srcstr)

    # --fixnames has already exited above, so only --stats is left to check
    if args.stats:
        dupe.statistics()
    else:
        dupe.analyze()
        dupe.output()
33 | 361 |