Wed, 13 Feb 2019 14:10:55 +0100
added support for remote ssh dupe checking against local basedir
3 | 1 | #!/usr/bin/env python |
33 | 2 | # -*- coding: utf-8 -*- |
21 | 3 | """ |
4 | Toolkit / executable to scan for duplicate filenames in movie database | |
5 | ||
36
a1ad6f4728be
added support for remote ssh dupe checking against local basedir
mdd
parents:
35
diff
changeset
|
6 | 2017-2019 by mdd |
21 | 7 | """ |
8 | ||
9 | #pylint: disable=line-too-long | |
10 | #pylint: disable=invalid-name | |
3 | 11 | |
4 | 12 | from __future__ import print_function |
35
14c966c10648
added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents:
33
diff
changeset
|
13 | import os, sys, re |
36
a1ad6f4728be
added support for remote ssh dupe checking against local basedir
mdd
parents:
35
diff
changeset
|
14 | import time |
35
14c966c10648
added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents:
33
diff
changeset
|
15 | |
14c966c10648
added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents:
33
diff
changeset
|
# Matches a parenthesised or bracketed group including its contents, e.g.
# "(2019)" or "[HD]"; non-greedy so that several groups in one title are
# removed individually. Raw string: "\(" is not a valid escape sequence in a
# normal string literal.
RE_PARENTHESES = re.compile(r"[\(\[].*?[\)\]]")
3 | 17 | |
22
c18abd9198c0
implemented Levenshtein algorithm for incredible speedup
mdd
parents:
21
diff
changeset
|
def similarity(a, b):
    """
    Return the similarity ratio of the two strings a and b.

    The backend is selected by the module level flag DIFFLIB, which is set
    by the command line section of this file: when it is False and the
    Levenshtein module has been imported there, Levenshtein.ratio() is used,
    otherwise difflib.SequenceMatcher.

    When the module is imported as a library the flag and the backend modules
    are not bound at all; instead of failing with a NameError the function
    then falls back to difflib from the standard library.
    """
    module_scope = globals()
    if not module_scope.get('DIFFLIB', True):
        levenshtein = module_scope.get('Levenshtein')
        if levenshtein is not None:
            return levenshtein.ratio(a, b)
    # local import: difflib is only imported at module level by the script section
    import difflib as _difflib
    return _difflib.SequenceMatcher(a=a, b=b).ratio()
c18abd9198c0
implemented Levenshtein algorithm for incredible speedup
mdd
parents:
21
diff
changeset
|
23 | |
suffixes = ['b', 'K', 'M', 'G', 'T', 'P']


def humansize(nbytes):
    """
    Format a byte count as a short human readable string.

    The value is divided by 1024 until it is below 1024 or the largest unit
    in ``suffixes`` is reached, printed with at most two decimals (trailing
    zeros and a dangling decimal point removed) and followed by the unit,
    e.g. ``1536`` -> ``'1.5 K'``.
    """
    last_unit = len(suffixes) - 1
    unit = 0
    while unit < last_unit and nbytes >= 1024:
        nbytes /= 1024.
        unit += 1
    number = '%.2f' % nbytes
    number = number.rstrip('0')
    number = number.rstrip('.')
    return '%s %s' % (number, suffixes[unit])
32 | ||
def replace_all(text, dic):
    """
    Return text with every key of dic replaced by its value.

    The replacements are applied one after another in the iteration order
    of dic, so a later replacement also sees the result of earlier ones.
    """
    # .items() instead of .iteritems(): the latter only exists on Python 2
    for search, replacement in dic.items():
        text = text.replace(search, replacement)
    return text
37 | ||
class dupechecker(object):
    """
    Simple class to scan multiple directories recursive,
    build a list of movie filenames.
    analyze the list for duplicates and dump them

    Every entry of ``filelist`` is a list ``[title, filename, root, ext]``
    where title is the normalized (lower case, parentheses removed) name
    used for comparison and ext the lower case extension including the dot.
    """

    # extensions that are always collected, in addition to ".ts" recordings
    MOVIE_EXTENSIONS = ('.mkv', '.avi', '.mpg', '.mpeg', '.mp4')

    def __init__(self):
        self.basedir = ""
        self.filelist = []
        self.duplicates = {}
        self.ratio = 0.85
        self.ignore_fileprefix = []
        self.ssh = None
        self.ssh_data = None

    def reset(self):
        """
        Forget all scanned files and all duplicates found so far
        """
        self.filelist = []
        self.duplicates = {}

    def __scandir_files(self, root, files, extra=None):
        """
        Add all movie files out of the filename list ``files`` to the filelist

        root:  directory (or remote location label) the filenames belong to
        files: iterable of plain filenames
        extra: optional list of additional lower case extensions to collect
        """
        extra = extra or ()
        for filename in files:
            stem, ext = os.path.splitext(filename)
            ext = ext.lower()
            if ext == ".ts":
                # recordings look like "<a> - <b> - <title>.ts", only the
                # part after the second separator is the title
                parts = stem.split(" - ")
                if len(parts) > 1:
                    # a name with only one separator has no third part; use
                    # the last part instead of an empty title, which would
                    # match every other such file
                    stem = " - ".join(parts[2:] or parts[-1:])
            elif ext not in self.MOVIE_EXTENSIONS and ext not in extra:
                continue

            # the stem comes from splitext, so extensions of any length
            # (".mpeg") are stripped completely;
            # remove parentheses with contents in title
            title = RE_PARENTHESES.sub("", stem.lower())
            self.filelist.append([title, filename, root, ext])

    def scandir(self, basedir, extra=None):
        """
        Scan a base directory for movie files and add them to
        the list for analyze

        extra: optional list of additional lower case extensions to collect
        """
        self.basedir = basedir
        print("Scanning directory: %s" % basedir)
        for root, _subdirs, files in os.walk(basedir):
            self.__scandir_files(root, files, extra)

    def scandir_remote(self, extra=None):
        """
        connect to remote ssh servers and get file lists for duplicate check

        The hosts are read from REMOTE_HOSTS in config.py; every host is a
        mapping with the keys 'host', 'port', 'user', 'pass', 'key' and
        'basedir'. Only "*.ts" files directly inside basedir are listed, so
        ``extra`` currently has no effect here.
        Exits the program when config.py or paramiko is not available.
        """
        print("getting filelist from remote hosts...")
        try:
            from config import REMOTE_HOSTS
        except ImportError:
            print("Please configure REMOTE_HOSTS in config.py!")
            sys.exit(1)
        try:
            import paramiko
        except ImportError:
            print("Please install Paramiko!")
            sys.exit(1)

        self.ssh = paramiko.SSHClient()
        # NOTE(review): AutoAddPolicy accepts unknown host keys without any
        # check; paramiko.WarningPolicy() would at least log them
        self.ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())

        for host in REMOTE_HOSTS:
            self.ssh_data = host
            try:
                lines = self.__ssh_exec(
                    'cd %s; ls -1 *.ts' % self.ssh_data['basedir'])[0]
                cleanlist = []
                for item in lines:
                    # drop non-ascii characters, as before
                    name = item.strip().encode('ascii', 'ignore')
                    if not isinstance(name, str):
                        # Python 3: encode() returned bytes, go back to text
                        name = name.decode('ascii')
                    cleanlist.append(name)
                self.__scandir_files("%s: %s" % (
                    self.ssh_data['host'], self.ssh_data['basedir']), cleanlist)
            finally:
                # always drop the connection, otherwise the next host would
                # reuse the still active transport of this one
                self.__ssh_disconnect()

    def __ssh_exec(self, command):
        """
        establish ssh connection and execute command
        the connection remains open for following commands until ssh_disconnect is called

        Returns a tuple (stdout lines, stderr lines) or None when no ssh
        client has been set up.
        """
        if self.ssh is None:
            return None
        transport = self.ssh.get_transport()
        if not transport or not transport.is_active():
            print("SSH: connecting to %s" % self.ssh_data['host'])
            self.ssh.connect(
                self.ssh_data['host'], self.ssh_data['port'],
                self.ssh_data['user'], self.ssh_data['pass'],
                self.ssh_data['key'])

        # Send the command (non-blocking)
        _stdin, stdout, stderr = self.ssh.exec_command(command)

        # Wait until the command terminated or first output is available
        while not stdout.channel.exit_status_ready() and not stdout.channel.recv_ready():
            time.sleep(1)

        stdoutstring = stdout.readlines()
        stderrstring = stderr.readlines()
        return stdoutstring, stderrstring

    def __ssh_disconnect(self):
        """
        check if ssh is connected and disconnect
        """
        if self.ssh is None:
            return
        transport = self.ssh.get_transport()
        if transport and transport.is_active():
            print("SSH: disconnecting")
        # Close client connection, also when the transport is already dead
        self.ssh.close()

    def fixnames(self):
        """
        Search for defect filenames and remove illegal characters

        Only .mkv and .txt entries of the filelist are handled. A file is
        not renamed when the cleaned name already exists in its directory.
        """
        for item in self.filelist:
            if item[3] not in ['.mkv', '.txt']:
                continue
            # umlauts are deliberately kept, only sharp s is transliterated
            cleanfn = replace_all(item[1], {
                'ß':'ss',
            })
            # any other character outside of the whitelist becomes a dash
            cleanfn = re.sub(r'[^A-Za-z0-9\.\_\-\(\)\&öäüÖÄÜ\' ]', '-', cleanfn)
            if item[1] == cleanfn:
                continue
            print (item[1])
            target = os.path.join(item[2], cleanfn)
            if os.path.exists(target):
                # os.rename would silently replace the existing file
                print ("Skipped, target already exists: %s" % target)
                continue
            os.rename(os.path.join(item[2], item[1]), target)
            # keep the list entry in sync with the file on disk
            item[1] = cleanfn

    def statistics(self):
        """
        Summarize disk usage and print stats about found filetypes
        """
        stats = {}
        for item in self.filelist:
            entry = stats.setdefault(item[3], [0, 0.0])
            entry[0] += 1
            try:
                entry[1] += os.stat(os.path.join(item[2], item[1])).st_size
            except OSError:
                # entries of remote hosts (or files that vanished meanwhile)
                # cannot be stat'ed locally: counted, but without size
                pass
        print ("%5s %6s %10s" % (
            "File:",
            "Count:",
            "Size:"))
        sum_count = 0
        sum_size = 0.0
        for ext in stats:
            sum_count += stats[ext][0]
            sum_size += stats[ext][1]
            print ("%5s %6i %10s" % (
                ext, stats[ext][0],
                humansize(stats[ext][1])))
        print ("%5s %6i %10s" % (
            "TOTAL", sum_count,
            humansize(sum_size)))

    def analyze(self):
        """
        Analyze the scanlist for duplicates

        Titles starting with one of the ignore_fileprefix strings are
        removed from the filelist first. Every remaining title is compared
        against all following ones; an entry whose similarity is above
        self.ratio is stored in self.duplicates under the path of the first
        file and replaced by None in the filelist.
        """
        listlen = len(self.filelist)
        print("%i files to analyze, running duplicate testing loop..." % (
            listlen))

        # remove potentially unwanted entries from the list
        prefixes = tuple(tst for tst in self.ignore_fileprefix if tst != '')
        if prefixes:
            # in place, the list object itself stays the same;
            # None entries of a previous run are kept and skipped below
            self.filelist[:] = [
                entry for entry in self.filelist
                if not (entry and entry[0].startswith(prefixes))]
            listlen = len(self.filelist)

        for idx in range(listlen):
            base = self.filelist[idx]
            if not base:
                continue
            # progress line, "\033[K" clears the rest of the terminal line
            print("\r%d %s\033[K" % (
                idx, base[0]), end='')
            sys.stdout.flush()
            for idx2 in range(idx + 1, listlen):
                other = self.filelist[idx2]
                if not other:
                    continue
                if similarity(base[0], other[0]) > self.ratio:
                    key = os.path.join(base[2], base[1])
                    self.duplicates.setdefault(key, []).append(
                        os.path.join(other[2], other[1]))
                    # unset the found duplicate, so that this will not be scanned again
                    self.filelist[idx2] = None
        print("\n\n")

    def output(self):
        """
        Dump found duplicates to console
        """
        for idx, base in enumerate(self.duplicates, 1):
            print("Duplicate file set #%i" % idx)
            print(base)
            for dup in self.duplicates[base]:
                print(dup)
            print()
271 | ||
272 | ||
if __name__ == "__main__":
    # Command line entry point: scan the given base directories (and
    # optionally the configured ssh remotes) and either fix file names,
    # print statistics or report duplicates.

    # parse command line options
    import argparse

    parser = argparse.ArgumentParser(
        description='Movie database filename duplicate checker')
    parser.add_argument('--ratio', type=float, default=0.85,
        help='filename duplicate threshold 0.1 < ratio 1.0 (default 0.85)')
    parser.add_argument('--difflib', action='store_true', default=False,
        help='force the use of difflib instead Levenshtein')
    parser.add_argument('--stats', action='store_true', default=False,
        help='generate stats summary instead of check for duplicates')
    parser.add_argument('--remote', action='store_true', default=False,
        help='Connect to ssh remotes, eg. dupecheck for dreambox local storage')
    parser.add_argument('--fixnames', action='store_true', default=False,
        help='scan for mkv and txt, fix broken filenames for windows')
    parser.add_argument('basedir', metavar='basedir', nargs='+',
        help='one or more base directories')

    args = parser.parse_args()
    dupe = dupechecker()
    dupe.ratio = args.ratio

    # select the backend used by similarity(); DIFFLIB and the imported
    # module are module level names read by that function
    if args.difflib:
        DIFFLIB = True
        import difflib
    else:
        try:
            import Levenshtein
            DIFFLIB = False
        except ImportError:
            import difflib
            DIFFLIB = True
            print("Consider 'pip install python-Levenshtein' for faster analyze")

    if os.path.isfile("dupecheck-ignore.txt"):
        # read the entire file line by line into buffer
        print("Loading ignore filename prefixes file for dupe checking...")
        # text mode and a context manager: the lines are compared against
        # text titles, and the file handle is closed again
        with open("dupecheck-ignore.txt", "r") as ignore_file:
            dupe.ignore_fileprefix = [
                line.rstrip('\n').rstrip('\r') for line in ignore_file]

    if args.fixnames:
        # handle one base directory after the other, then quit
        for srcstr in args.basedir:
            dupe.scandir(srcstr, ['.txt'])
            if len(dupe.filelist) > 0:
                print ("Checking %i file names..." % len(dupe.filelist))
                dupe.fixnames()
            dupe.filelist = []
        sys.exit(0)

    if args.remote:
        dupe.scandir_remote()

    for srcstr in args.basedir:
        dupe.scandir(srcstr)

    # --fixnames never gets here, it has already exited above
    if args.stats:
        dupe.statistics()
    else:
        dupe.analyze()
        dupe.output()
33 | 332 |