mirror of
https://github.com/SickGear/SickGear.git
synced 2024-11-22 04:45:05 +00:00
e56303798c
Initial SickGear for Python 3.
833 lines
30 KiB
Python
833 lines
30 KiB
Python
# -*- coding: utf-8 -*-
|
|
# enzyme - Video metadata parser
|
|
# Copyright 2011-2012 Antoine Bertin <diaoulael@gmail.com>
|
|
# Copyright 2003-2006 Thomas Schueppel <stain@acm.org>
|
|
# Copyright 2003-2006 Dirk Meyer <dischi@freevo.org>
|
|
# Copyright 2003-2006 Jason Tackaberry <tack@urandom.ca>
|
|
#
|
|
# This file is part of enzyme.
|
|
#
|
|
# enzyme is free software; you can redistribute it and/or modify it under
|
|
# the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# enzyme is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with enzyme. If not, see <http://www.gnu.org/licenses/>.
|
|
from datetime import datetime
|
|
from .exceptions import ParseError
|
|
from struct import unpack
|
|
from . import core
|
|
import logging
|
|
import re
|
|
from _23 import decode_str
|
|
from six import byte2int, indexbytes
|
|
|
|
__all__ = ['Parser']
|
|
|
|
# get logging object
|
|
log = logging.getLogger(__name__)
|
|
|
|
# Main IDs for the Matroska streams
|
|
MATROSKA_VIDEO_TRACK = 0x01
|
|
MATROSKA_AUDIO_TRACK = 0x02
|
|
MATROSKA_SUBTITLES_TRACK = 0x11
|
|
|
|
MATROSKA_HEADER_ID = 0x1A45DFA3
|
|
MATROSKA_TRACKS_ID = 0x1654AE6B
|
|
MATROSKA_CUES_ID = 0x1C53BB6B
|
|
MATROSKA_SEGMENT_ID = 0x18538067
|
|
MATROSKA_SEGMENT_INFO_ID = 0x1549A966
|
|
MATROSKA_CLUSTER_ID = 0x1F43B675
|
|
MATROSKA_VOID_ID = 0xEC
|
|
MATROSKA_CRC_ID = 0xBF
|
|
MATROSKA_TIMECODESCALE_ID = 0x2AD7B1
|
|
MATROSKA_DURATION_ID = 0x4489
|
|
MATROSKA_CRC32_ID = 0xBF
|
|
# MATROSKA_TIMECODESCALE_ID = 0x2AD7B1
|
|
MATROSKA_MUXING_APP_ID = 0x4D80
|
|
MATROSKA_WRITING_APP_ID = 0x5741
|
|
MATROSKA_CODEC_ID = 0x86
|
|
MATROSKA_CODEC_PRIVATE_ID = 0x63A2
|
|
MATROSKA_FRAME_DURATION_ID = 0x23E383
|
|
MATROSKA_VIDEO_SETTINGS_ID = 0xE0
|
|
MATROSKA_VIDEO_WIDTH_ID = 0xB0
|
|
MATROSKA_VIDEO_HEIGHT_ID = 0xBA
|
|
MATROSKA_VIDEO_INTERLACED_ID = 0x9A
|
|
MATROSKA_VIDEO_DISPLAY_WIDTH_ID = 0x54B0
|
|
MATROSKA_VIDEO_DISPLAY_HEIGHT_ID = 0x54BA
|
|
MATROSKA_AUDIO_SETTINGS_ID = 0xE1
|
|
MATROSKA_AUDIO_SAMPLERATE_ID = 0xB5
|
|
MATROSKA_AUDIO_CHANNELS_ID = 0x9F
|
|
MATROSKA_TRACK_UID_ID = 0x73C5
|
|
MATROSKA_TRACK_NUMBER_ID = 0xD7
|
|
MATROSKA_TRACK_TYPE_ID = 0x83
|
|
MATROSKA_TRACK_LANGUAGE_ID = 0x22B59C
|
|
MATROSKA_TRACK_OFFSET = 0x537F
|
|
MATROSKA_TRACK_FLAG_DEFAULT_ID = 0x88
|
|
MATROSKA_TRACK_FLAG_ENABLED_ID = 0xB9
|
|
MATROSKA_TITLE_ID = 0x7BA9
|
|
MATROSKA_DATE_UTC_ID = 0x4461
|
|
MATROSKA_NAME_ID = 0x536E
|
|
|
|
MATROSKA_CHAPTERS_ID = 0x1043A770
|
|
MATROSKA_CHAPTER_UID_ID = 0x73C4
|
|
MATROSKA_EDITION_ENTRY_ID = 0x45B9
|
|
MATROSKA_CHAPTER_ATOM_ID = 0xB6
|
|
MATROSKA_CHAPTER_TIME_START_ID = 0x91
|
|
MATROSKA_CHAPTER_TIME_END_ID = 0x92
|
|
MATROSKA_CHAPTER_FLAG_ENABLED_ID = 0x4598
|
|
MATROSKA_CHAPTER_DISPLAY_ID = 0x80
|
|
MATROSKA_CHAPTER_LANGUAGE_ID = 0x437C
|
|
MATROSKA_CHAPTER_STRING_ID = 0x85
|
|
|
|
MATROSKA_ATTACHMENTS_ID = 0x1941A469
|
|
MATROSKA_ATTACHED_FILE_ID = 0x61A7
|
|
MATROSKA_FILE_DESC_ID = 0x467E
|
|
MATROSKA_FILE_NAME_ID = 0x466E
|
|
MATROSKA_FILE_MIME_TYPE_ID = 0x4660
|
|
MATROSKA_FILE_DATA_ID = 0x465C
|
|
|
|
MATROSKA_SEEKHEAD_ID = 0x114D9B74
|
|
MATROSKA_SEEK_ID = 0x4DBB
|
|
MATROSKA_SEEKID_ID = 0x53AB
|
|
MATROSKA_SEEK_POSITION_ID = 0x53AC
|
|
|
|
MATROSKA_TAGS_ID = 0x1254C367
|
|
MATROSKA_TAG_ID = 0x7373
|
|
MATROSKA_TARGETS_ID = 0x63C0
|
|
MATROSKA_TARGET_TYPE_VALUE_ID = 0x68CA
|
|
MATROSKA_TARGET_TYPE_ID = 0x63CA
|
|
MATRSOKA_TAGS_TRACK_UID_ID = 0x63C5
|
|
MATRSOKA_TAGS_EDITION_UID_ID = 0x63C9
|
|
MATRSOKA_TAGS_CHAPTER_UID_ID = 0x63C4
|
|
MATRSOKA_TAGS_ATTACHMENT_UID_ID = 0x63C6
|
|
MATROSKA_SIMPLE_TAG_ID = 0x67C8
|
|
MATROSKA_TAG_NAME_ID = 0x45A3
|
|
MATROSKA_TAG_LANGUAGE_ID = 0x447A
|
|
MATROSKA_TAG_STRING_ID = 0x4487
|
|
MATROSKA_TAG_BINARY_ID = 0x4485
|
|
|
|
# See mkv spec for details:
|
|
# http://www.matroska.org/technical/specs/index.html
|
|
|
|
# Map to convert to well known codes
|
|
# http://haali.cs.msu.ru/mkv/codecs.pdf
|
|
FOURCCMap = {
|
|
'V_THEORA': 'THEO',
|
|
'V_SNOW': 'SNOW',
|
|
'V_MPEG4/ISO/ASP': 'MP4V',
|
|
'V_MPEG4/ISO/AVC': 'AVC1',
|
|
'A_AC3': 0x2000,
|
|
'A_MPEG/L3': 0x0055,
|
|
'A_MPEG/L2': 0x0050,
|
|
'A_MPEG/L1': 0x0050,
|
|
'A_DTS': 0x2001,
|
|
'A_PCM/INT/LIT': 0x0001,
|
|
'A_PCM/FLOAT/IEEE': 0x003,
|
|
'A_TTA1': 0x77a1,
|
|
'A_WAVPACK4': 0x5756,
|
|
'A_VORBIS': 0x6750,
|
|
'A_FLAC': 0xF1AC,
|
|
'A_AAC': 0x00ff,
|
|
'A_AAC/': 0x00ff
|
|
}
|
|
|
|
|
|
def matroska_date_to_datetime(date):
|
|
"""
|
|
Converts a date in Matroska's date format to a python datetime object.
|
|
Returns the given date string if it could not be converted.
|
|
"""
|
|
# From the specs:
|
|
# The fields with dates should have the following format: YYYY-MM-DD
|
|
# HH:MM:SS.MSS [...] To store less accuracy, you remove items starting
|
|
# from the right. To store only the year, you would use, "2004". To store
|
|
# a specific day such as May 1st, 2003, you would use "2003-05-01".
|
|
fmt = re.split(r'([-:. ])', '%Y-%m-%d %H:%M:%S.%f')
|
|
while fmt:
|
|
try:
|
|
return datetime.strptime(date, ''.join(fmt))
|
|
except ValueError:
|
|
fmt = fmt[:-2]
|
|
return date
|
|
|
|
|
|
def matroska_bps_to_bitrate(bps):
|
|
"""
|
|
Tries to convert a free-form bps string into a bitrate (bits per second).
|
|
"""
|
|
m = re.search(r'([\d.]+)\s*(\D.*)', bps)
|
|
if m:
|
|
bps, suffix = m.groups()
|
|
if 'kbit' in suffix:
|
|
return float(bps) * 1024
|
|
elif 'kbyte' in suffix:
|
|
return float(bps) * 1024 * 8
|
|
elif 'byte' in suffix:
|
|
return float(bps) * 8
|
|
elif 'bps' in suffix or 'bit' in suffix:
|
|
return float(bps)
|
|
if bps.replace('.', '').isdigit():
|
|
if float(bps) < 30000:
|
|
# Assume kilobits and convert to bps
|
|
return float(bps) * 1024
|
|
return float(bps)
|
|
|
|
|
|
# Used to convert the official matroska tag names (only lower-cased) to core
|
|
# attributes. tag name -> attr, filter
|
|
TAGS_MAP = {
|
|
# From Media core
|
|
u'title': ('title', None),
|
|
u'subtitle': ('caption', None),
|
|
u'comment': ('comment', None),
|
|
u'url': ('url', None),
|
|
u'artist': ('artist', None),
|
|
u'keywords': ('keywords', lambda s: [word.strip() for word in s.split(',')]),
|
|
u'composer_nationality': ('country', None),
|
|
u'date_released': ('datetime', None),
|
|
u'date_recorded': ('datetime', None),
|
|
u'date_written': ('datetime', None),
|
|
|
|
# From Video core
|
|
u'encoder': ('encoder', None),
|
|
u'bps': ('bitrate', matroska_bps_to_bitrate),
|
|
u'part_number': ('trackno', int),
|
|
u'total_parts': ('trackof', int),
|
|
u'copyright': ('copyright', None),
|
|
u'genre': ('genre', None),
|
|
u'actor': ('actors', None),
|
|
u'written_by': ('writer', None),
|
|
u'producer': ('producer', None),
|
|
u'production_studio': ('studio', None),
|
|
u'law_rating': ('rating', None),
|
|
u'summary': ('summary', None),
|
|
u'synopsis': ('synopsis', None),
|
|
}
|
|
|
|
|
|
class EbmlEntity:
|
|
"""
|
|
This is class that is responsible to handle one Ebml entity as described in
|
|
the Matroska/Ebml spec
|
|
"""
|
|
|
|
def __init__(self, inbuf):
|
|
# Compute the EBML id
|
|
# Set the CRC len to zero
|
|
self.crc_len = 0
|
|
self.entity_data = None
|
|
self.ebml_length = None
|
|
self.len_size = None
|
|
self.entity_len = None
|
|
self.value = None
|
|
self.id_len = None
|
|
self.entity_id = None
|
|
self.entity_str = None
|
|
|
|
# Now loop until we find an entity without CRC
|
|
try:
|
|
self.build_entity(inbuf)
|
|
except IndexError:
|
|
raise ParseError()
|
|
while self.get_id() == MATROSKA_CRC32_ID:
|
|
self.crc_len += self.get_total_len()
|
|
inbuf = inbuf[self.get_total_len():]
|
|
self.build_entity(inbuf)
|
|
|
|
def build_entity(self, inbuf):
|
|
self.compute_id(inbuf)
|
|
|
|
if self.id_len == 0:
|
|
log.error(u'EBML entity not found, bad file format')
|
|
raise ParseError()
|
|
|
|
self.entity_len, self.len_size = self.compute_len(inbuf[self.id_len:])
|
|
self.entity_data = inbuf[self.get_header_len(): self.get_total_len()]
|
|
self.ebml_length = self.entity_len
|
|
self.entity_len = min(len(self.entity_data), self.entity_len)
|
|
|
|
# if the data size is 8 or less, it could be a numeric value
|
|
self.value = 0
|
|
if self.entity_len <= 8:
|
|
for pos, shift in zip(range(self.entity_len), range((self.entity_len - 1) * 8, -1, -8)):
|
|
self.value |= indexbytes(self.entity_data, pos) << shift
|
|
|
|
def add_data(self, data):
|
|
maxlen = self.ebml_length - len(self.entity_data)
|
|
if maxlen <= 0:
|
|
return
|
|
self.entity_data += data[:maxlen]
|
|
self.entity_len = len(self.entity_data)
|
|
|
|
def compute_id(self, inbuf):
|
|
self.id_len = 0
|
|
if len(inbuf) < 1:
|
|
return 0
|
|
first = byte2int(inbuf)
|
|
if first & 0x80:
|
|
self.id_len = 1
|
|
self.entity_id = first
|
|
elif first & 0x40:
|
|
if len(inbuf) < 2:
|
|
return 0
|
|
self.id_len = 2
|
|
self.entity_id = byte2int(inbuf) << 8 | indexbytes(inbuf, 1)
|
|
elif first & 0x20:
|
|
if len(inbuf) < 3:
|
|
return 0
|
|
self.id_len = 3
|
|
self.entity_id = (byte2int(inbuf) << 16) | (indexbytes(inbuf, 1) << 8) | \
|
|
(indexbytes(inbuf, 2))
|
|
elif first & 0x10:
|
|
if len(inbuf) < 4:
|
|
return 0
|
|
self.id_len = 4
|
|
self.entity_id = (byte2int(inbuf) << 24) | (indexbytes(inbuf, 1) << 16) | \
|
|
(indexbytes(inbuf, 2) << 8) | (indexbytes(inbuf, 3))
|
|
self.entity_str = inbuf[0:self.id_len]
|
|
|
|
@staticmethod
|
|
def compute_len(inbuf):
|
|
if not inbuf:
|
|
return 0, 0
|
|
i = num_ffs = 0
|
|
len_mask = 0x80
|
|
length = byte2int(inbuf)
|
|
while not length & len_mask:
|
|
i += 1
|
|
len_mask >>= 1
|
|
if i >= 8:
|
|
return 0, 0
|
|
|
|
length &= len_mask - 1
|
|
if length == len_mask - 1:
|
|
num_ffs += 1
|
|
for p in range(i):
|
|
length = (length << 8) | indexbytes(inbuf, p + 1)
|
|
if length & 0xff == 0xff:
|
|
num_ffs += 1
|
|
if num_ffs == i + 1:
|
|
length = 0
|
|
return length, i + 1
|
|
|
|
def get_crc_len(self):
|
|
return self.crc_len
|
|
|
|
def get_value(self):
|
|
return self.value
|
|
|
|
def get_float_value(self):
|
|
if len(self.entity_data) == 4:
|
|
return unpack('!f', self.entity_data)[0]
|
|
elif len(self.entity_data) == 8:
|
|
return unpack('!d', self.entity_data)[0]
|
|
return 0.0
|
|
|
|
def get_data(self):
|
|
return self.entity_data
|
|
|
|
def get_utf8(self):
|
|
return decode_str(self.entity_data, 'utf-8', 'replace')
|
|
|
|
def get_str(self):
|
|
return decode_str(self.entity_data, 'ascii', 'replace')
|
|
|
|
def get_id(self):
|
|
return self.entity_id
|
|
|
|
def get_str_id(self):
|
|
return self.entity_str
|
|
|
|
def get_len(self):
|
|
return self.entity_len
|
|
|
|
def get_total_len(self):
|
|
return self.entity_len + self.id_len + self.len_size
|
|
|
|
def get_header_len(self):
|
|
return self.id_len + self.len_size
|
|
|
|
|
|
class Matroska(core.AVContainer):
|
|
"""
|
|
Matroska video and audio parser. If at least one video stream is
|
|
detected it will set the type to MEDIA_AV.
|
|
"""
|
|
|
|
def __init__(self, file):
|
|
core.AVContainer.__init__(self)
|
|
self.samplerate = 1
|
|
self.title = None
|
|
self.timestamp = None
|
|
self.length = None
|
|
self.thumbnail = None
|
|
self.season = None
|
|
self.episode = None
|
|
self.trackno = None
|
|
self.series = None
|
|
|
|
self.file = file
|
|
# Read enough that we're likely to get the full seekhead (FIXME: kludge)
|
|
buffer = file.read(2000)
|
|
if len(buffer) == 0:
|
|
# Regular File end
|
|
raise ParseError()
|
|
|
|
# Check the Matroska header
|
|
header = EbmlEntity(buffer)
|
|
if header.get_id() != MATROSKA_HEADER_ID:
|
|
raise ParseError()
|
|
|
|
log.debug(u'HEADER ID found %08X' % header.get_id())
|
|
self.mime = 'video/x-matroska'
|
|
self.type = 'Matroska'
|
|
self.has_idx = False
|
|
self.objects_by_uid = {}
|
|
|
|
# Now get the segment
|
|
self.segment = segment = EbmlEntity(buffer[header.get_total_len():])
|
|
# Record file offset of segment data for seekheads
|
|
self.segment.offset = header.get_total_len() + segment.get_header_len()
|
|
if segment.get_id() != MATROSKA_SEGMENT_ID:
|
|
log.debug(u'SEGMENT ID not found %08X' % segment.get_id())
|
|
return
|
|
|
|
log.debug(u'SEGMENT ID found %08X' % segment.get_id())
|
|
try:
|
|
for elem in self.process_one_level(segment):
|
|
if elem.get_id() == MATROSKA_SEEKHEAD_ID:
|
|
self.process_elem(elem)
|
|
except ParseError:
|
|
pass
|
|
|
|
if not self.has_idx:
|
|
log.warning(u'File has no index')
|
|
self._set('corrupt', True)
|
|
|
|
def process_elem(self, elem):
|
|
elem_id = elem.get_id()
|
|
log.debug(u'BEGIN: process element %r' % hex(elem_id))
|
|
if elem_id == MATROSKA_SEGMENT_INFO_ID:
|
|
duration = 0
|
|
scalecode = 1000000.0
|
|
|
|
for ielem in self.process_one_level(elem):
|
|
ielem_id = ielem.get_id()
|
|
if ielem_id == MATROSKA_TIMECODESCALE_ID:
|
|
scalecode = ielem.get_value()
|
|
elif ielem_id == MATROSKA_DURATION_ID:
|
|
duration = ielem.get_float_value()
|
|
elif ielem_id == MATROSKA_TITLE_ID:
|
|
self.title = ielem.get_utf8()
|
|
elif ielem_id == MATROSKA_DATE_UTC_ID:
|
|
timestamp = unpack('!q', ielem.get_data())[0] / 10.0 ** 9
|
|
# Date is offset 2001-01-01 00:00:00 (timestamp 978307200.0)
|
|
self.timestamp = int(timestamp + 978307200)
|
|
|
|
self.length = duration * scalecode / 1000000000.0
|
|
|
|
elif elem_id == MATROSKA_TRACKS_ID:
|
|
self.process_tracks(elem)
|
|
|
|
elif elem_id == MATROSKA_CHAPTERS_ID:
|
|
self.process_chapters(elem)
|
|
|
|
elif elem_id == MATROSKA_ATTACHMENTS_ID:
|
|
self.process_attachments(elem)
|
|
|
|
elif elem_id == MATROSKA_SEEKHEAD_ID:
|
|
self.process_seekhead(elem)
|
|
|
|
elif elem_id == MATROSKA_TAGS_ID:
|
|
self.process_tags(elem)
|
|
|
|
elif elem_id == MATROSKA_CUES_ID:
|
|
self.has_idx = True
|
|
|
|
log.debug(u'END: process element %r' % hex(elem_id))
|
|
return True
|
|
|
|
def process_seekhead(self, elem):
|
|
for seek_elem in self.process_one_level(elem):
|
|
if seek_elem.get_id() != MATROSKA_SEEK_ID:
|
|
continue
|
|
for sub_elem in self.process_one_level(seek_elem):
|
|
if sub_elem.get_id() == MATROSKA_SEEKID_ID:
|
|
if sub_elem.get_value() == MATROSKA_CLUSTER_ID:
|
|
# Not interested in these.
|
|
return
|
|
|
|
elif sub_elem.get_id() == MATROSKA_SEEK_POSITION_ID:
|
|
self.file.seek(self.segment.offset + sub_elem.get_value())
|
|
buffer = self.file.read(100)
|
|
try:
|
|
elem = EbmlEntity(buffer)
|
|
except ParseError:
|
|
continue
|
|
|
|
# Fetch all data necessary for this element.
|
|
elem.add_data(self.file.read(elem.ebml_length))
|
|
self.process_elem(elem)
|
|
|
|
def process_tracks(self, tracks):
|
|
tracksbuf = tracks.get_data()
|
|
index = 0
|
|
while index < tracks.get_len():
|
|
trackelem = EbmlEntity(tracksbuf[index:])
|
|
log.debug(u'ELEMENT %X found' % trackelem.get_id())
|
|
self.process_track(trackelem)
|
|
index += trackelem.get_total_len() + trackelem.get_crc_len()
|
|
|
|
@staticmethod
|
|
def process_one_level(item):
|
|
buf = item.get_data()
|
|
index = 0
|
|
while index < item.get_len():
|
|
if len(buf[index:]) == 0:
|
|
break
|
|
elem = EbmlEntity(buf[index:])
|
|
yield elem
|
|
index += elem.get_total_len() + elem.get_crc_len()
|
|
|
|
@staticmethod
|
|
def set_track_defaults(track):
|
|
track.language = 'eng'
|
|
|
|
def process_track(self, track):
|
|
# Collapse generator into a list since we need to iterate over it
|
|
# twice.
|
|
elements = [x for x in self.process_one_level(track)]
|
|
track_type = [x.get_value() for x in elements if x.get_id() == MATROSKA_TRACK_TYPE_ID]
|
|
if not track_type:
|
|
log.debug(u'Bad track: no type id found')
|
|
return
|
|
|
|
track_type = track_type[0]
|
|
# track = None
|
|
|
|
if track_type == MATROSKA_VIDEO_TRACK:
|
|
log.debug(u'Video track found')
|
|
# track = self.process_video_track(elements)
|
|
elif track_type == MATROSKA_AUDIO_TRACK:
|
|
log.debug(u'Audio track found')
|
|
# track = self.process_audio_track(elements)
|
|
elif track_type == MATROSKA_SUBTITLES_TRACK:
|
|
log.debug(u'Subtitle track found')
|
|
track = core.Subtitle()
|
|
self.set_track_defaults(track)
|
|
track.id = len(self.subtitles)
|
|
self.subtitles.append(track)
|
|
for elem in elements:
|
|
self.process_track_common(elem, track)
|
|
|
|
def process_track_common(self, elem, track):
|
|
elem_id = elem.get_id()
|
|
if elem_id == MATROSKA_TRACK_LANGUAGE_ID:
|
|
track.language = elem.get_str()
|
|
log.debug(u'Track language found: %r' % track.language)
|
|
elif elem_id == MATROSKA_NAME_ID:
|
|
track.title = elem.get_utf8()
|
|
elif elem_id == MATROSKA_TRACK_NUMBER_ID:
|
|
track.trackno = elem.get_value()
|
|
elif elem_id == MATROSKA_TRACK_FLAG_ENABLED_ID:
|
|
track.enabled = bool(elem.get_value())
|
|
elif elem_id == MATROSKA_TRACK_FLAG_DEFAULT_ID:
|
|
track.default = bool(elem.get_value())
|
|
elif elem_id == MATROSKA_CODEC_ID:
|
|
track.codec = elem.get_str()
|
|
elif elem_id == MATROSKA_CODEC_PRIVATE_ID:
|
|
track.codec_private = elem.get_data()
|
|
elif elem_id == MATROSKA_TRACK_UID_ID:
|
|
self.objects_by_uid[elem.get_value()] = track
|
|
|
|
def process_video_track(self, elements):
|
|
track = core.VideoStream()
|
|
# Defaults
|
|
track.codec = u'Unknown'
|
|
track.fps = 0
|
|
self.set_track_defaults(track)
|
|
|
|
for elem in elements:
|
|
elem_id = elem.get_id()
|
|
if elem_id == MATROSKA_CODEC_ID:
|
|
track.codec = elem.get_str()
|
|
|
|
elif elem_id == MATROSKA_FRAME_DURATION_ID:
|
|
try:
|
|
track.fps = 1 / (pow(10, -9) * (elem.get_value()))
|
|
except ZeroDivisionError:
|
|
pass
|
|
|
|
elif elem_id == MATROSKA_VIDEO_SETTINGS_ID:
|
|
d_width = d_height = None
|
|
for settings_elem in self.process_one_level(elem):
|
|
settings_elem_id = settings_elem.get_id()
|
|
if settings_elem_id == MATROSKA_VIDEO_WIDTH_ID:
|
|
track.width = settings_elem.get_value()
|
|
elif settings_elem_id == MATROSKA_VIDEO_HEIGHT_ID:
|
|
track.height = settings_elem.get_value()
|
|
elif settings_elem_id == MATROSKA_VIDEO_DISPLAY_WIDTH_ID:
|
|
d_width = settings_elem.get_value()
|
|
elif settings_elem_id == MATROSKA_VIDEO_DISPLAY_HEIGHT_ID:
|
|
d_height = settings_elem.get_value()
|
|
elif settings_elem_id == MATROSKA_VIDEO_INTERLACED_ID:
|
|
value = int(settings_elem.get_value())
|
|
self._set('interlaced', value)
|
|
|
|
if None not in [d_width, d_height]:
|
|
track.aspect = float(d_width) / d_height
|
|
|
|
else:
|
|
self.process_track_common(elem, track)
|
|
|
|
# convert codec information
|
|
# http://haali.cs.msu.ru/mkv/codecs.pdf
|
|
if track.codec in FOURCCMap:
|
|
track.codec = FOURCCMap[track.codec]
|
|
elif '/' in track.codec and track.codec.split('/')[0] + '/' in FOURCCMap:
|
|
track.codec = FOURCCMap[track.codec.split('/')[0] + '/']
|
|
elif track.codec.endswith('FOURCC') and len(track.codec_private or '') == 40:
|
|
track.codec = track.codec_private[16:20]
|
|
elif track.codec.startswith('V_REAL/'):
|
|
track.codec = track.codec[7:]
|
|
elif track.codec.startswith('V_'):
|
|
# FIXME: add more video codecs here
|
|
track.codec = track.codec[2:]
|
|
|
|
track.id = len(self.video)
|
|
self.video.append(track)
|
|
return track
|
|
|
|
def process_audio_track(self, elements):
|
|
track = core.AudioStream()
|
|
track.codec = u'Unknown'
|
|
self.set_track_defaults(track)
|
|
|
|
for elem in elements:
|
|
elem_id = elem.get_id()
|
|
if elem_id == MATROSKA_CODEC_ID:
|
|
track.codec = elem.get_str()
|
|
elif elem_id == MATROSKA_AUDIO_SETTINGS_ID:
|
|
for settings_elem in self.process_one_level(elem):
|
|
settings_elem_id = settings_elem.get_id()
|
|
if settings_elem_id == MATROSKA_AUDIO_SAMPLERATE_ID:
|
|
track.samplerate = settings_elem.get_float_value()
|
|
elif settings_elem_id == MATROSKA_AUDIO_CHANNELS_ID:
|
|
track.channels = settings_elem.get_value()
|
|
else:
|
|
self.process_track_common(elem, track)
|
|
|
|
if track.codec in FOURCCMap:
|
|
track.codec = FOURCCMap[track.codec]
|
|
elif '/' in track.codec and track.codec.split('/')[0] + '/' in FOURCCMap:
|
|
track.codec = FOURCCMap[track.codec.split('/')[0] + '/']
|
|
elif track.codec.startswith('A_'):
|
|
track.codec = track.codec[2:]
|
|
|
|
track.id = len(self.audio)
|
|
self.audio.append(track)
|
|
return track
|
|
|
|
def process_chapters(self, chapters):
|
|
elements = self.process_one_level(chapters)
|
|
for elem in elements:
|
|
if elem.get_id() == MATROSKA_EDITION_ENTRY_ID:
|
|
buf = elem.get_data()
|
|
index = 0
|
|
while index < elem.get_len():
|
|
sub_elem = EbmlEntity(buf[index:])
|
|
if sub_elem.get_id() == MATROSKA_CHAPTER_ATOM_ID:
|
|
self.process_chapter_atom(sub_elem)
|
|
index += sub_elem.get_total_len() + sub_elem.get_crc_len()
|
|
|
|
def process_chapter_atom(self, atom):
|
|
elements = self.process_one_level(atom)
|
|
chap = core.Chapter()
|
|
|
|
for elem in elements:
|
|
elem_id = elem.get_id()
|
|
if elem_id == MATROSKA_CHAPTER_TIME_START_ID:
|
|
# Scale timecode to seconds (float)
|
|
chap.pos = elem.get_value() / 1000000 / 1000.0
|
|
elif elem_id == MATROSKA_CHAPTER_FLAG_ENABLED_ID:
|
|
chap.enabled = elem.get_value()
|
|
elif elem_id == MATROSKA_CHAPTER_DISPLAY_ID:
|
|
# Matroska supports multiple (chapter name, language) pairs for
|
|
# each chapter, so chapter names can be internationalized. This
|
|
# logic will only take the last one in the list.
|
|
for display_elem in self.process_one_level(elem):
|
|
if display_elem.get_id() == MATROSKA_CHAPTER_STRING_ID:
|
|
chap.name = display_elem.get_utf8()
|
|
elif elem_id == MATROSKA_CHAPTER_UID_ID:
|
|
self.objects_by_uid[elem.get_value()] = chap
|
|
|
|
log.debug(u'Chapter %r found', chap.name)
|
|
chap.id = len(self.chapters)
|
|
self.chapters.append(chap)
|
|
|
|
def process_attachments(self, attachments):
|
|
buf = attachments.get_data()
|
|
index = 0
|
|
while index < attachments.get_len():
|
|
elem = EbmlEntity(buf[index:])
|
|
if elem.get_id() == MATROSKA_ATTACHED_FILE_ID:
|
|
self.process_attachment(elem)
|
|
index += elem.get_total_len() + elem.get_crc_len()
|
|
|
|
def process_attachment(self, attachment):
|
|
elements = self.process_one_level(attachment)
|
|
name = desc = mimetype = ""
|
|
data = None
|
|
|
|
for elem in elements:
|
|
elem_id = elem.get_id()
|
|
if elem_id == MATROSKA_FILE_NAME_ID:
|
|
name = elem.get_utf8()
|
|
elif elem_id == MATROSKA_FILE_DESC_ID:
|
|
desc = elem.get_utf8()
|
|
elif elem_id == MATROSKA_FILE_MIME_TYPE_ID:
|
|
mimetype = elem.get_data()
|
|
elif elem_id == MATROSKA_FILE_DATA_ID:
|
|
data = elem.get_data()
|
|
|
|
# Right now we only support attachments that could be cover images.
|
|
# Make a guess to see if this attachment is a cover image.
|
|
if mimetype.startswith("image/") and u"cover" in (name + desc).lower() and data:
|
|
self.thumbnail = data
|
|
|
|
log.debug(u'Attachment %r found' % name)
|
|
|
|
def process_tags(self, tags):
|
|
# Tags spec: http://www.matroska.org/technical/specs/tagging/index.html
|
|
# Iterate over Tags children. Tags element children is a
|
|
# Tag element (whose children are SimpleTags) and a Targets element
|
|
# whose children specific what objects the tags apply to.
|
|
for tag_elem in self.process_one_level(tags):
|
|
# Start a new dict to hold all SimpleTag elements.
|
|
tags_dict = core.Tags()
|
|
# A list of target uids this tags dict applies too. If empty,
|
|
# tags are global.
|
|
targets = []
|
|
for sub_elem in self.process_one_level(tag_elem):
|
|
if sub_elem.get_id() == MATROSKA_SIMPLE_TAG_ID:
|
|
self.process_simple_tag(sub_elem, tags_dict)
|
|
elif sub_elem.get_id() == MATROSKA_TARGETS_ID:
|
|
# Targets element: if there is no uid child (track uid,
|
|
# chapter uid, etc.) then the tags dict applies to the
|
|
# whole file (top-level Media object).
|
|
for target_elem in self.process_one_level(sub_elem):
|
|
target_elem_id = target_elem.get_id()
|
|
if target_elem_id in (MATRSOKA_TAGS_TRACK_UID_ID, MATRSOKA_TAGS_EDITION_UID_ID,
|
|
MATRSOKA_TAGS_CHAPTER_UID_ID, MATRSOKA_TAGS_ATTACHMENT_UID_ID):
|
|
targets.append(target_elem.get_value())
|
|
elif target_elem_id == MATROSKA_TARGET_TYPE_VALUE_ID:
|
|
# Target types not supported for now. (Unclear how this
|
|
# would fit with kaa.metadata.)
|
|
pass
|
|
if targets:
|
|
# Assign tags to all listed uids
|
|
for target in targets:
|
|
try:
|
|
self.objects_by_uid[target].tags.update(tags_dict)
|
|
self.tags_to_attributes(self.objects_by_uid[target], tags_dict)
|
|
except KeyError:
|
|
log.warning(u'Tags assigned to unknown/unsupported target uid %d', target)
|
|
else:
|
|
self.tags.update(tags_dict)
|
|
self.tags_to_attributes(self, tags_dict)
|
|
|
|
def process_simple_tag(self, simple_tag_elem, tags_dict):
|
|
"""
|
|
Returns a dict representing the Tag element.
|
|
"""
|
|
name = lang = value = children = None
|
|
binary = False
|
|
for elem in self.process_one_level(simple_tag_elem):
|
|
elem_id = elem.get_id()
|
|
if elem_id == MATROSKA_TAG_NAME_ID:
|
|
name = elem.get_utf8().lower()
|
|
elif elem_id == MATROSKA_TAG_STRING_ID:
|
|
value = elem.get_utf8()
|
|
elif elem_id == MATROSKA_TAG_BINARY_ID:
|
|
value = elem.get_data()
|
|
binary = True
|
|
elif elem_id == MATROSKA_TAG_LANGUAGE_ID:
|
|
lang = elem.get_utf8()
|
|
elif elem_id == MATROSKA_SIMPLE_TAG_ID:
|
|
if children is None:
|
|
children = core.Tags()
|
|
self.process_simple_tag(elem, children)
|
|
|
|
if children:
|
|
# Convert ourselves to a Tags object.
|
|
children.value = value
|
|
children.langcode = lang
|
|
value = children
|
|
else:
|
|
if name.startswith('date_'):
|
|
# Try to convert date to a datetime object.
|
|
value = matroska_date_to_datetime(value)
|
|
value = core.Tag(value, lang, binary)
|
|
|
|
if name in tags_dict:
|
|
# Multiple items of this tag name.
|
|
if not isinstance(tags_dict[name], list):
|
|
# Convert to a list
|
|
tags_dict[name] = [tags_dict[name]]
|
|
# Append to list
|
|
tags_dict[name].append(value)
|
|
else:
|
|
tags_dict[name] = value
|
|
|
|
def tags_to_attributes(self, obj, tags):
|
|
# Convert tags to core attributes.
|
|
for name, tag in tags.items():
|
|
if isinstance(tag, dict):
|
|
# Nested tags dict, recurse.
|
|
self.tags_to_attributes(obj, tag)
|
|
continue
|
|
elif name not in TAGS_MAP:
|
|
continue
|
|
|
|
attr, f = TAGS_MAP[name]
|
|
if attr not in obj._keys and attr not in self._keys:
|
|
# Tag is not in any core attribute for this object or global,
|
|
# so skip.
|
|
continue
|
|
|
|
# Pull value out of Tag object or list of Tag objects.
|
|
value = [item.value for item in tag] if isinstance(tag, list) else tag.value
|
|
if f:
|
|
try:
|
|
value = [f(item) for item in value] if isinstance(value, list) else f(value)
|
|
except Exception as e:
|
|
log.warning(u'Failed to convert tag to core attribute: %r', e)
|
|
# Special handling for tv series recordings. The 'title' tag
|
|
# can be used for both the series and the episode name. The
|
|
# same is true for trackno which may refer to the season
|
|
# and the episode number. Therefore, if we find these
|
|
# attributes already set we try some guessing.
|
|
if attr == 'trackno' and getattr(self, attr) is not None:
|
|
# delete trackno and save season and episode
|
|
self.season = self.trackno
|
|
self.episode = value
|
|
self.trackno = None
|
|
continue
|
|
if attr == 'title' and getattr(self, attr) is not None:
|
|
# store current value of title as series and use current
|
|
# value of title as title
|
|
self.series = self.title
|
|
if attr in obj._keys:
|
|
setattr(obj, attr, value)
|
|
else:
|
|
setattr(self, attr, value)
|
|
|
|
|
|
Parser = Matroska
|