Merge pull request #6428 from dstftw/improve-generic-smil-support
Improve generic SMIL support
This commit is contained in:
commit
d5d7bdaeb5
|
@ -133,8 +133,8 @@ def expect_info_dict(self, got_dict, expected_dict):
|
||||||
elif isinstance(expected, compat_str) and expected.startswith('mincount:'):
|
elif isinstance(expected, compat_str) and expected.startswith('mincount:'):
|
||||||
got = got_dict.get(info_field)
|
got = got_dict.get(info_field)
|
||||||
self.assertTrue(
|
self.assertTrue(
|
||||||
isinstance(got, list),
|
isinstance(got, (list, dict)),
|
||||||
'Expected field %s to be a list, but it is of type %s' % (
|
'Expected field %s to be a list or a dict, but it is of type %s' % (
|
||||||
info_field, type(got).__name__))
|
info_field, type(got).__name__))
|
||||||
expected_num = int(expected.partition(':')[2])
|
expected_num = int(expected.partition(':')[2])
|
||||||
assertGreaterEqual(
|
assertGreaterEqual(
|
||||||
|
|
|
@ -136,7 +136,9 @@ def generator(test_case):
|
||||||
# We're not using .download here since that is just a shim
|
# We're not using .download here since that is just a shim
|
||||||
# for outside error handling, and returns the exit code
|
# for outside error handling, and returns the exit code
|
||||||
# instead of the result dict.
|
# instead of the result dict.
|
||||||
res_dict = ydl.extract_info(test_case['url'])
|
res_dict = ydl.extract_info(
|
||||||
|
test_case['url'],
|
||||||
|
force_generic_extractor=params.get('force_generic_extractor', False))
|
||||||
except (DownloadError, ExtractorError) as err:
|
except (DownloadError, ExtractorError) as err:
|
||||||
# Check if the exception is not a network related one
|
# Check if the exception is not a network related one
|
||||||
if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError, compat_http_client.BadStatusLine) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503):
|
if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError, compat_http_client.BadStatusLine) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503):
|
||||||
|
|
|
@ -18,6 +18,7 @@ from ..compat import (
|
||||||
compat_HTTPError,
|
compat_HTTPError,
|
||||||
compat_http_client,
|
compat_http_client,
|
||||||
compat_urllib_error,
|
compat_urllib_error,
|
||||||
|
compat_urllib_parse,
|
||||||
compat_urllib_parse_urlparse,
|
compat_urllib_parse_urlparse,
|
||||||
compat_urllib_request,
|
compat_urllib_request,
|
||||||
compat_urlparse,
|
compat_urlparse,
|
||||||
|
@ -37,6 +38,7 @@ from ..utils import (
|
||||||
RegexNotFoundError,
|
RegexNotFoundError,
|
||||||
sanitize_filename,
|
sanitize_filename,
|
||||||
unescapeHTML,
|
unescapeHTML,
|
||||||
|
url_basename,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -978,69 +980,167 @@ class InfoExtractor(object):
|
||||||
self._sort_formats(formats)
|
self._sort_formats(formats)
|
||||||
return formats
|
return formats
|
||||||
|
|
||||||
# TODO: improve extraction
|
@staticmethod
|
||||||
def _extract_smil_formats(self, smil_url, video_id, fatal=True):
|
def _xpath_ns(path, namespace=None):
|
||||||
smil = self._download_xml(
|
if not namespace:
|
||||||
smil_url, video_id, 'Downloading SMIL file',
|
return path
|
||||||
'Unable to download SMIL file', fatal=fatal)
|
out = []
|
||||||
|
for c in path.split('/'):
|
||||||
|
if not c or c == '.':
|
||||||
|
out.append(c)
|
||||||
|
else:
|
||||||
|
out.append('{%s}%s' % (namespace, c))
|
||||||
|
return '/'.join(out)
|
||||||
|
|
||||||
|
def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
|
||||||
|
smil = self._download_smil(smil_url, video_id, fatal=fatal)
|
||||||
|
|
||||||
if smil is False:
|
if smil is False:
|
||||||
assert not fatal
|
assert not fatal
|
||||||
return []
|
return []
|
||||||
|
|
||||||
base = smil.find('./head/meta').get('base')
|
namespace = self._parse_smil_namespace(smil)
|
||||||
|
|
||||||
|
return self._parse_smil_formats(
|
||||||
|
smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
|
||||||
|
|
||||||
|
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
|
||||||
|
smil = self._download_smil(smil_url, video_id, fatal=fatal)
|
||||||
|
if smil is False:
|
||||||
|
return {}
|
||||||
|
return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
|
||||||
|
|
||||||
|
def _download_smil(self, smil_url, video_id, fatal=True):
|
||||||
|
return self._download_xml(
|
||||||
|
smil_url, video_id, 'Downloading SMIL file',
|
||||||
|
'Unable to download SMIL file', fatal=fatal)
|
||||||
|
|
||||||
|
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
|
||||||
|
namespace = self._parse_smil_namespace(smil)
|
||||||
|
|
||||||
|
formats = self._parse_smil_formats(
|
||||||
|
smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
|
||||||
|
subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
|
||||||
|
|
||||||
|
video_id = os.path.splitext(url_basename(smil_url))[0]
|
||||||
|
title = None
|
||||||
|
description = None
|
||||||
|
for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
|
||||||
|
name = meta.attrib.get('name')
|
||||||
|
content = meta.attrib.get('content')
|
||||||
|
if not name or not content:
|
||||||
|
continue
|
||||||
|
if not title and name == 'title':
|
||||||
|
title = content
|
||||||
|
elif not description and name in ('description', 'abstract'):
|
||||||
|
description = content
|
||||||
|
|
||||||
|
return {
|
||||||
|
'id': video_id,
|
||||||
|
'title': title or video_id,
|
||||||
|
'description': description,
|
||||||
|
'formats': formats,
|
||||||
|
'subtitles': subtitles,
|
||||||
|
}
|
||||||
|
|
||||||
|
def _parse_smil_namespace(self, smil):
|
||||||
|
return self._search_regex(
|
||||||
|
r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
|
||||||
|
|
||||||
|
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None):
|
||||||
|
base = smil_url
|
||||||
|
for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
|
||||||
|
b = meta.get('base') or meta.get('httpBase')
|
||||||
|
if b:
|
||||||
|
base = b
|
||||||
|
break
|
||||||
|
|
||||||
formats = []
|
formats = []
|
||||||
rtmp_count = 0
|
rtmp_count = 0
|
||||||
if smil.findall('./body/seq/video'):
|
http_count = 0
|
||||||
video = smil.findall('./body/seq/video')[0]
|
|
||||||
fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
|
|
||||||
formats.extend(fmts)
|
|
||||||
else:
|
|
||||||
for video in smil.findall('./body/switch/video'):
|
|
||||||
fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
|
|
||||||
formats.extend(fmts)
|
|
||||||
|
|
||||||
self._sort_formats(formats)
|
videos = smil.findall(self._xpath_ns('.//video', namespace))
|
||||||
|
for video in videos:
|
||||||
return formats
|
|
||||||
|
|
||||||
def _parse_smil_video(self, video, video_id, base, rtmp_count):
|
|
||||||
src = video.get('src')
|
src = video.get('src')
|
||||||
if not src:
|
if not src:
|
||||||
return [], rtmp_count
|
continue
|
||||||
|
|
||||||
bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
|
bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
|
||||||
|
filesize = int_or_none(video.get('size') or video.get('fileSize'))
|
||||||
width = int_or_none(video.get('width'))
|
width = int_or_none(video.get('width'))
|
||||||
height = int_or_none(video.get('height'))
|
height = int_or_none(video.get('height'))
|
||||||
proto = video.get('proto')
|
proto = video.get('proto')
|
||||||
if not proto:
|
|
||||||
if base:
|
|
||||||
if base.startswith('rtmp'):
|
|
||||||
proto = 'rtmp'
|
|
||||||
elif base.startswith('http'):
|
|
||||||
proto = 'http'
|
|
||||||
ext = video.get('ext')
|
ext = video.get('ext')
|
||||||
if proto == 'm3u8':
|
src_ext = determine_ext(src)
|
||||||
return self._extract_m3u8_formats(src, video_id, ext), rtmp_count
|
|
||||||
elif proto == 'rtmp':
|
|
||||||
rtmp_count += 1
|
|
||||||
streamer = video.get('streamer') or base
|
streamer = video.get('streamer') or base
|
||||||
return ([{
|
|
||||||
|
if proto == 'rtmp' or streamer.startswith('rtmp'):
|
||||||
|
rtmp_count += 1
|
||||||
|
formats.append({
|
||||||
'url': streamer,
|
'url': streamer,
|
||||||
'play_path': src,
|
'play_path': src,
|
||||||
'ext': 'flv',
|
'ext': 'flv',
|
||||||
'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
|
'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
|
||||||
'tbr': bitrate,
|
'tbr': bitrate,
|
||||||
|
'filesize': filesize,
|
||||||
'width': width,
|
'width': width,
|
||||||
'height': height,
|
'height': height,
|
||||||
}], rtmp_count)
|
})
|
||||||
elif proto.startswith('http'):
|
continue
|
||||||
return ([{
|
|
||||||
'url': base + src,
|
src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
|
||||||
'ext': ext or 'flv',
|
|
||||||
|
if proto == 'm3u8' or src_ext == 'm3u8':
|
||||||
|
formats.extend(self._extract_m3u8_formats(
|
||||||
|
src_url, video_id, ext or 'mp4', m3u8_id='hls'))
|
||||||
|
continue
|
||||||
|
|
||||||
|
if src_ext == 'f4m':
|
||||||
|
f4m_url = src_url
|
||||||
|
if not f4m_params:
|
||||||
|
f4m_params = {
|
||||||
|
'hdcore': '3.2.0',
|
||||||
|
'plugin': 'flowplayer-3.2.0.1',
|
||||||
|
}
|
||||||
|
f4m_url += '&' if '?' in f4m_url else '?'
|
||||||
|
f4m_url += compat_urllib_parse.urlencode(f4m_params)
|
||||||
|
formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
|
||||||
|
continue
|
||||||
|
|
||||||
|
if src_url.startswith('http'):
|
||||||
|
http_count += 1
|
||||||
|
formats.append({
|
||||||
|
'url': src_url,
|
||||||
|
'ext': ext or src_ext or 'flv',
|
||||||
|
'format_id': 'http-%d' % (bitrate or http_count),
|
||||||
'tbr': bitrate,
|
'tbr': bitrate,
|
||||||
|
'filesize': filesize,
|
||||||
'width': width,
|
'width': width,
|
||||||
'height': height,
|
'height': height,
|
||||||
}], rtmp_count)
|
})
|
||||||
|
continue
|
||||||
|
|
||||||
|
self._sort_formats(formats)
|
||||||
|
|
||||||
|
return formats
|
||||||
|
|
||||||
|
def _parse_smil_subtitles(self, smil, namespace=None):
|
||||||
|
subtitles = {}
|
||||||
|
for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
|
||||||
|
src = textstream.get('src')
|
||||||
|
if not src:
|
||||||
|
continue
|
||||||
|
ext = textstream.get('ext') or determine_ext(src)
|
||||||
|
if not ext:
|
||||||
|
type_ = textstream.get('type')
|
||||||
|
if type_ == 'text/srt':
|
||||||
|
ext = 'srt'
|
||||||
|
lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName')
|
||||||
|
subtitles.setdefault(lang, []).append({
|
||||||
|
'url': src,
|
||||||
|
'ext': ext,
|
||||||
|
})
|
||||||
|
return subtitles
|
||||||
|
|
||||||
def _live_title(self, name):
|
def _live_title(self, name):
|
||||||
""" Generate the title for a live video """
|
""" Generate the title for a live video """
|
||||||
|
|
|
@ -130,6 +130,74 @@ class GenericIE(InfoExtractor):
|
||||||
'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
|
'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
# SMIL from http://videolectures.net/promogram_igor_mekjavic_eng
|
||||||
|
{
|
||||||
|
'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'smil',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'title': 'Automatics, robotics and biocybernetics',
|
||||||
|
'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
|
||||||
|
'formats': 'mincount:16',
|
||||||
|
'subtitles': 'mincount:1',
|
||||||
|
},
|
||||||
|
'params': {
|
||||||
|
'force_generic_extractor': True,
|
||||||
|
'skip_download': True,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
# SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html
|
||||||
|
{
|
||||||
|
'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'hds',
|
||||||
|
'ext': 'flv',
|
||||||
|
'title': 'hds',
|
||||||
|
'formats': 'mincount:1',
|
||||||
|
},
|
||||||
|
'params': {
|
||||||
|
'skip_download': True,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
# SMIL from https://www.restudy.dk/video/play/id/1637
|
||||||
|
{
|
||||||
|
'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'video_1637',
|
||||||
|
'ext': 'flv',
|
||||||
|
'title': 'video_1637',
|
||||||
|
'formats': 'mincount:3',
|
||||||
|
},
|
||||||
|
'params': {
|
||||||
|
'skip_download': True,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
# SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm
|
||||||
|
{
|
||||||
|
'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil',
|
||||||
|
'info_dict': {
|
||||||
|
'id': 'smil-service',
|
||||||
|
'ext': 'flv',
|
||||||
|
'title': 'smil-service',
|
||||||
|
'formats': 'mincount:1',
|
||||||
|
},
|
||||||
|
'params': {
|
||||||
|
'skip_download': True,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
# SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370
|
||||||
|
{
|
||||||
|
'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil',
|
||||||
|
'info_dict': {
|
||||||
|
'id': '4719370',
|
||||||
|
'ext': 'mp4',
|
||||||
|
'title': '571de1fd-47bc-48db-abf9-238872a58d1f',
|
||||||
|
'formats': 'mincount:3',
|
||||||
|
},
|
||||||
|
'params': {
|
||||||
|
'skip_download': True,
|
||||||
|
},
|
||||||
|
},
|
||||||
# google redirect
|
# google redirect
|
||||||
{
|
{
|
||||||
'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
|
'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
|
||||||
|
@ -1123,11 +1191,13 @@ class GenericIE(InfoExtractor):
|
||||||
|
|
||||||
self.report_extraction(video_id)
|
self.report_extraction(video_id)
|
||||||
|
|
||||||
# Is it an RSS feed?
|
# Is it an RSS feed or a SMIL file?
|
||||||
try:
|
try:
|
||||||
doc = parse_xml(webpage)
|
doc = parse_xml(webpage)
|
||||||
if doc.tag == 'rss':
|
if doc.tag == 'rss':
|
||||||
return self._extract_rss(url, video_id, doc)
|
return self._extract_rss(url, video_id, doc)
|
||||||
|
elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
|
||||||
|
return self._parse_smil(doc, url, video_id)
|
||||||
except compat_xml_parse_error:
|
except compat_xml_parse_error:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
|
@ -12,7 +12,7 @@ from ..utils import (
|
||||||
|
|
||||||
|
|
||||||
class VideoLecturesNetIE(InfoExtractor):
|
class VideoLecturesNetIE(InfoExtractor):
|
||||||
_VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/'
|
_VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)(?:/?[#?].*)?$'
|
||||||
IE_NAME = 'videolectures.net'
|
IE_NAME = 'videolectures.net'
|
||||||
|
|
||||||
_TEST = {
|
_TEST = {
|
||||||
|
|
Loading…
Reference in New Issue