added stats calc to dupechecker

Tue, 12 Dec 2017 03:25:44 +0100

author
mdd
date
Tue, 12 Dec 2017 03:25:44 +0100
changeset 32
df89a8fba2a2
parent 31
52371bbcde5c
child 33
83bcb5931ee3

added stats calc to dupechecker

dupecheck.py file | annotate | diff | comparison | revisions
--- a/dupecheck.py	Tue Dec 12 03:02:31 2017 +0100
+++ b/dupecheck.py	Tue Dec 12 03:25:44 2017 +0100
@@ -17,6 +17,15 @@
     else:
         return Levenshtein.ratio(a, b)
 
+suffixes = ['b', 'K', 'M', 'G', 'T', 'P']
+def humansize(nbytes):
+    i = 0
+    while nbytes >= 1024 and i < len(suffixes)-1:
+        nbytes /= 1024.
+        i += 1
+    f = ('%.2f' % nbytes).rstrip('0').rstrip('.')
+    return '%s %s' % (f, suffixes[i])
+
 class dupechecker(object):
     """
     Simple class to scan multiple directories recursive,
@@ -52,10 +61,32 @@
                     else:
                         title = " - ".join(title[2:])
                     title = title[:-3].lower()
-                    self.filelist.append([title, filename, root])
+                    self.filelist.append([title, filename, root, ext])
                 elif ext in ['.mkv', '.avi', '.mpg', '.mpeg', '.mp4']:
                     title = filename[:-4].lower()
-                    self.filelist.append([title, filename, root])
+                    self.filelist.append([title, filename, root, ext])
+
+    def statistics(self):
+        """
+        Summarize disk usage and print stats about found filetypes
+        """
+        stats = {}
+        for item in self.filelist:
+            if not item[3] in stats:
+                stats[item[3]] = [0, 0.0]
+            stats[item[3]][0] += 1
+            stats[item[3]][1] += os.stat(
+                os.path.join(
+                    item[2], item[1])).st_size
+        print ("%5s %6s %10s" % (
+            "File:",
+            "Count:",
+            "Size:"))
+        for ext in stats.keys():
+            print ("%5s %6i %10s" % (
+                ext, stats[ext][0],
+                humansize(stats[ext][1])))
+
 
     def analyze(self):
         """
@@ -111,6 +142,8 @@
         help='filename duplicate threshold 0.1 < ratio 1.0 (default 0.85)')
     parser.add_argument('--difflib', action='store_true', default=False, \
         help='force the use of difflib instead Levenshtein')
+    parser.add_argument('--stats', action='store_true', default=False, \
+        help='generate stats summary instead of check for duplicates')
     parser.add_argument('basedir', metavar='basedir', nargs='+', \
         help='one or more base directories')
 
@@ -132,5 +165,8 @@
 
     for srcstr in args.basedir:
         dupe.scandir(srcstr)
-    dupe.analyze()
-    dupe.output()
+    if args.stats:
+        dupe.statistics()
+    else:
+        dupe.analyze()
+        dupe.output()

mercurial