Wed, 29 Nov 2017 23:34:51 +0100
implemented Levenshtein algorithm for incredible speedup
#!/usr/bin/python
# -*- coding: utf-8 -*-
# iso-8859-2
"""
EitSupport
Copyright (C) 2011 betonme
Copyright (C) 2016 Wolfgang Fahl
Cleanup 2017 by mdd
"""
# This EITParser is based on:
# https://github.com/betonme/e2openplugin-EnhancedMovieCenter/blob/master/src/EitSupport.py
#
# In case of reuse of this source code please do not remove this copyright.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# For more information on the GNU General Public License see:
# <http://www.gnu.org/licenses/>.
#
# page 36, contents of the for loop!
# https://www.dvb.org/resources/public/standards/a38_dvb-si_specification.pdf
#
# NOTE: the byte handling in this module (ord() on single characters,
# str.decode()) is written for Python 2 byte strings.

#pylint: disable=missing-docstring
#pylint: disable=line-too-long

import os
import struct
import sys
import getopt
from datetime import datetime

from ISO639 import LanguageCodes

EIT_SHORT_EVENT_DESCRIPTOR = 0x4d
EIT_EXTENDED_EVENT_DESCRIPOR = 0x4e
# Correctly spelled alias; the misspelled name above is kept for existing users.
EIT_EXTENDED_EVENT_DESCRIPTOR = EIT_EXTENDED_EVENT_DESCRIPOR

# Descriptor tags handled besides the two event descriptors.
_COMPONENT_DESCRIPTOR = 0x50
# Content, linkage and parental-rating descriptors: recognised, not evaluated.
_IGNORED_DESCRIPTORS = (0x54, 0x4A, 0x55)

# Size of the fixed event header unpacked with ">HHBBBBBBH".
_EVENT_HEADER_SIZE = 12

# Control bytes stripped from every text field.
_CONTROL_CHARS = ('\x10', '\x00', '\x02', '\x15')

CHARSPEC_HR = {
    u'Ć': u'\u0106',
    u'æ': u'\u0107',
    u'®': u'\u017D',
    u'¾': u'\u017E',
    u'©': u'\u0160',
    u'¹': u'\u0161',
    u'Č': u'\u010C',
    u'è': u'\u010D',
    u'ð': u'\u0111',
}

CHARSPEC_CZSK = {
    u'Ï'+u'C': u'Č', u'Ï'+u'E': u'Ě', u'Ï'+u'L': u'Ľ', u'Ï'+u'N': u'Ň',
    u'Ï'+u'R': u'Ř', u'Ï'+u'S': u'Š', u'Ï'+u'T': u'Ť', u'Ï'+u'Z': u'Ž',
    u'Ï'+u'c': u'č', u'Ï'+u'd': u'ď', u'Ï'+u'e': u'ě', u'Ï'+u'l': u'ľ',
    u'Ï'+u'n': u'ň', u'Ï'+u'r': u'ř', u'Ï'+u's': u'š', u'Ï'+u't': u'ť',
    u'Ï'+u'z': u'ž', u'Ï'+u'D': u'Ď',
    u'Â'+u'A': u'Á', u'Â'+u'E': u'É', u'Â'+u'I': u'Í', u'Â'+u'O': u'Ó',
    u'Â'+u'U': u'Ú', u'Â'+u'a': u'á', u'Â'+u'e': u'é', u'Â'+u'i': u'í',
    u'Â'+u'o': u'ó', u'Â'+u'u': u'ú', u'Â'+u'y': u'ý',
    u'Ã'+u'o': u'ô', u'Ã'+u'O': u'Ô',
    u'Ê'+u'u': u'ů', u'Ê'+u'U': u'Ů',
    u'È'+u'A': u'Ä', u'È'+u'E': u'Ë', u'È'+u'I': u'Ï', u'È'+u'O': u'Ö',
    u'È'+u'U': u'Ü', u'È'+u'Y': u'Ÿ', u'È'+u'a': u'ä', u'È'+u'e': u'ë',
    u'È'+u'i': u'ï', u'È'+u'o': u'ö', u'È'+u'u': u'ü', u'È'+u'y': u'ÿ',
}


def convert_charspec_hr(text):
    """Apply every CHARSPEC_HR substitution to *text* and return the result."""
    for source, target in CHARSPEC_HR.items():
        text = text.replace(source, target)
    return text


def convert_charspec_czsk(text):
    """Apply every CHARSPEC_CZSK substitution to *text* and return the result."""
    for source, target in CHARSPEC_CZSK.items():
        text = text.replace(source, target)
    return text


def parse_mjd(mjd):
    """Parse 16 bit unsigned int containing Modified Julian Date,
    as per DVB-SI spec, returning year, month, day."""
    year = int((mjd - 15078.2) / 365.25)
    month = int((mjd - 14956.1 - int(year * 365.25)) / 30.6001)
    day = mjd - 14956 - int(year * 365.25) - int(month * 30.6001)
    # Months 14 and 15 of the formula are January/February of the next year.
    correction = 1 if month in (14, 15) else 0
    return (1900 + year + correction), (month - 1 - correction * 12), day


def bcd2dec(byte):
    """Convert one packed-BCD byte (two decimal digits) to its integer value."""
    return (byte >> 4) * 10 + (byte & 0xf)


def mkint(data):
    """Convert *data* to an integer; any falsy value yields 0."""
    return int(data) if data else 0


def todate(sdate, stime):
    """Build a datetime from a (year, month, day) and an (hour, minute, ...) sequence.

    Returns None when either argument is empty or the values do not form a
    valid date.
    """
    if not (sdate and stime):
        return None
    try:
        return datetime(
            int(sdate[0]), int(sdate[1]), int(sdate[2]),
            int(stime[0]), int(stime[1]))
    except ValueError:
        return None


def cleanstring(data):
    """Remove the non-printable control characters from *data*."""
    for char in _CONTROL_CHARS:
        data = data.replace(char, '')
    return data


def language_iso639_2to3(alpha2):
    """Return a three-letter code from LanguageCodes naming the same language
    as *alpha2*; *alpha2* itself is returned when no such entry exists."""
    if alpha2 not in LanguageCodes:
        return alpha2
    language = LanguageCodes[alpha2]
    for alpha, name in LanguageCodes.items():
        if name == language and len(alpha) == 3:
            return alpha
    return alpha2


def _byte_at(data, index):
    """Return the value of the byte at *index*, or 0 when *index* lies past
    the end of *data* (truncated file)."""
    return ord(data[index]) if index < len(data) else 0


class eitinfo(object):
    """Eit File support class

    Description
    http://de.wikipedia.org/wiki/Event_Information_Table
    """

    def __init__(self, path=None):
        self.eit_file = None
        self.eit = {}
        self.load(path)

    def load(self, path):
        """Forget any previous event and, when *path* is given, parse that file."""
        self.eit = {}
        self.eit_file = None
        if path:
            self.eit_file = path
            self._read_file()

    def get_genre(self):
        return self.eit.get('genre', "")

    def get_components(self):
        return self.eit.get('components', "")

    def get_startdate(self):
        return self.eit.get('startdate', "")

    def get_starttime(self):
        return self.eit.get('starttime', "")

    def get_duration(self):
        return self.eit.get('duration', "")

    def get_name(self):
        return self.eit.get('name', "").strip()

    def get_description(self):
        return self.eit.get('description', "").strip()

    def get_duration_seconds(self):
        """Return the duration as one integer.

        A three-part duration (hours, minutes, seconds) yields seconds; a
        missing duration yields 0.
        """
        duration = self.eit.get('duration', "")
        if not duration:
            return 0
        total = 0
        # Horner scheme: ((h * 60) + m) * 60 + s for the usual three parts.
        for part in duration[:3]:
            total = total * 60 + part
        return mkint(total)

    def get_date(self):
        return todate(self.get_startdate(), self.get_starttime())

    def dump(self):
        """Print a summary of the loaded event and return it as a string.

        Returns None (and prints nothing) when no event is loaded.
        """
        if not self.eit:
            return None
        duration = ("Duration: %02i:%02i:%02i" % self.get_duration()) \
            + (" (%s minutes)" % (self.get_duration_seconds() // 60))
        out = "\n".join([
            "Movie name: %s" % self.get_name(),
            "Genre: %s" % self.get_genre(),
            "Components: %s" % self.get_components(),
            "StartDate: %s" % self.get_date(),
            "Description: %s" % self.get_description(),
            duration,
        ])
        print(out)
        return out

    ##############################################################################
    ## File IO Functions
    def _read_file(self):
        """Read self.eit_file and fill self.eit; self.eit is left empty when
        the file is missing, unreadable or shorter than the event header."""
        path = self.eit_file
        if not path or not os.path.exists(path):
            self.eit = {}
            return
        print("Reading Event Information Table " + str(path))
        data = self._read_bytes(path)
        if not data or len(data) < _EVENT_HEADER_SIZE:
            # No data, clear all
            self.eit = {}
            return
        self._parse(data, language_iso639_2to3("de").lower())

    @staticmethod
    def _read_bytes(path):
        """Return the raw content of *path*, or "" when it cannot be read."""
        try:
            with open(path, 'rb') as handle:
                return handle.read()
        except (IOError, OSError) as err:
            print("[META] Exception in readEitFile: " + str(err))
            return ""

    def _parse(self, data, lang):
        """Decode the event header and all descriptors in *data* into self.eit.

        *lang* is the lower-case language code whose descriptors are
        preferred; when no descriptor carries it, all of them are used.
        """
        header = struct.unpack(">HHBBBBBBH", data[:_EVENT_HEADER_SIZE])
        # header[0] is the event id; header[8] packs running_status (3 bits),
        # free_CA_mode (1 bit) and descriptors_loop_length (12 bits). None of
        # them is evaluated here.
        self.eit['startdate'] = parse_mjd(header[1])                       # Y, M, D
        self.eit['starttime'] = tuple(bcd2dec(b) for b in header[2:5])     # HH, MM, SS
        self.eit['duration'] = tuple(bcd2dec(b) for b in header[5:8])      # HH, MM, SS

        names, names_all = [], []
        texts, texts_all = [], []
        components = []

        pos = _EVENT_HEADER_SIZE
        last = len(data) - 1
        while pos < last:
            tag = ord(data[pos])
            # Payload length plus the two bytes for tag and length themselves.
            length = ord(data[pos + 1]) + 2
            if tag == EIT_SHORT_EVENT_DESCRIPTOR:
                # Layout: tag, length, 3-byte language code, name length, name,
                # text length, text.
                code = str(data[pos + 2:pos + 5]).lower()
                name_end = pos + 6 + _byte_at(data, pos + 5)
                name = cleanstring(data[pos + 6:name_end])
                text_len = _byte_at(data, name_end)
                # The second length-prefixed string (the descriptor's text
                # field in DVB-SI) is what this module exposes as 'genre'.
                self.eit['genre'] = cleanstring(data[name_end + 1:name_end + 1 + text_len])
                if code == lang:
                    names.append(name)
                names_all.append(name)
            elif tag == EIT_EXTENDED_EVENT_DESCRIPOR:
                # Layout: tag, length, descriptor numbers, 3-byte language code,
                # length of items, ... The text is taken from offset 8, i.e. an
                # empty item list is assumed.
                code = str(data[pos + 3:pos + 6]).lower()
                # 0x8A is the DVB line-break control code.
                text = cleanstring(data[pos + 8:pos + length].replace('\x8a', '\n'))
                if code == lang:
                    texts.append(text)
                texts_all.append(text)
            elif tag == _COMPONENT_DESCRIPTOR:
                components.append(cleanstring(data[pos + 8:pos + length]))
            elif tag in _IGNORED_DESCRIPTORS:
                pass
            else:
                print("unsupported descriptor: %x %x" % (tag, pos + 12))
                print(data[pos:pos + length])
            pos += length

        self.eit['components'] = ", ".join(components)

        name = "".join(names or names_all)
        if name:
            self.eit['name'] = self._to_utf8(name, lang)
        description = "".join(texts or texts_all)
        if description:
            self.eit['description'] = self._to_utf8(description, lang)

    @staticmethod
    def _to_utf8(text, lang):
        """Return *text* as UTF-8, re-encoding from cp1252 when it is not
        already valid UTF-8.

        Very bad, but there can be both encodings: user files can be in cp1252.
        """
        try:
            text.decode('utf-8')
        except UnicodeDecodeError:
            try:
                text = text.decode("cp1252").encode("utf-8")
            except UnicodeDecodeError:
                # do nothing, otherwise cyrillic won't be displayed properly
                pass
        # NOTE: the caller derives lang from "de", so neither branch below
        # is reached at present.
        if lang in ("cs", "sk"):
            text = str(convert_charspec_czsk(text))
        if lang == "hr":
            text = str(convert_charspec_hr(text))
        return text


def main():
    """Command line entry point: dump every EIT file named on the command line."""
    # parse command line options
    try:
        opts, args = getopt.getopt(sys.argv[1:], "h", ["help"])
    except getopt.error as msg:
        print(msg)
        print("for help use --help")
        sys.exit(2)
    # process options
    for option, _value in opts:
        if option in ("-h", "--help"):
            print(__doc__)
            sys.exit(0)
    # process arguments
    info = eitinfo()
    for arg in args:
        info.load(arg)
        info.dump()


if __name__ == "__main__":
    main()