commit 54a5be4dba3560ff51f98865b5598d361a878e82
parent ed6919e7371b3da66c10e4c5768816e81f4c5db3
Author: Remita Amine <remitamine@gmail.com>
Date:   Sat,  1 Sep 2018 08:16:28 +0100

[crunchyroll] parse vilos media data(closes #17343)

Diffstat:
Myoutube_dl/extractor/crunchyroll.py | 205++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
Myoutube_dl/extractor/vrv.py | 48++++++++++++++++++++++++++----------------------
2 files changed, 141 insertions(+), 112 deletions(-)

diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py @@ -7,7 +7,7 @@ import zlib from hashlib import sha1 from math import pow, sqrt, floor -from .common import InfoExtractor +from .vrv import VRVIE from ..compat import ( compat_b64decode, compat_etree_fromstring, @@ -18,6 +18,8 @@ from ..compat import ( from ..utils import ( ExtractorError, bytes_to_intlist, + extract_attributes, + float_or_none, intlist_to_bytes, int_or_none, lowercase_escape, @@ -26,14 +28,13 @@ from ..utils import ( unified_strdate, urlencode_postdata, xpath_text, - extract_attributes, ) from ..aes import ( aes_cbc_decrypt, ) -class CrunchyrollBaseIE(InfoExtractor): +class CrunchyrollBaseIE(VRVIE): _LOGIN_URL = 'https://www.crunchyroll.com/login' _LOGIN_FORM = 'login_form' _NETRC_MACHINE = 'crunchyroll' @@ -148,7 +149,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'ext': 'mp4', 'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!', 'description': 'md5:2d17137920c64f2f49981a7797d275ef', - 'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Yomiuri Telecasting Corporation (YTV)', 'upload_date': '20131013', 'url': 're:(?!.*&amp)', @@ -221,7 +222,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'info_dict': { 'id': '535080', 'ext': 'mp4', - 'title': '11eyes Episode 1 – Piros éjszaka - Red Night', + 'title': '11eyes Episode 1 – Red Night ~ Piros éjszaka', 'description': 'Kakeru and Yuka are thrown into an alternate nightmarish world they call "Red Night".', 'uploader': 'Marvelous AQL Inc.', 'upload_date': '20091021', @@ -437,13 +438,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text if 'To view this, please log in to verify you are 18 or older.' in webpage: self.raise_login_required() + media = self._parse_json(self._search_regex( + r'vilos\.config\.media\s*=\s*({.+?});', + webpage, 'vilos media', default='{}'), video_id) + media_metadata = media.get('metadata') or {} + video_title = self._html_search_regex( r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>', webpage, 'video_title') video_title = re.sub(r' {2,}', ' ', video_title) - video_description = self._parse_json(self._html_search_regex( + video_description = (self._parse_json(self._html_search_regex( r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id, - webpage, 'description', default='{}'), video_id).get('description') + webpage, 'description', default='{}'), video_id) or media_metadata).get('description') if video_description: video_description = lowercase_escape(video_description.replace(r'\r\n', '\n')) video_upload_date = self._html_search_regex( @@ -456,91 +462,99 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'], webpage, 'video_uploader', fatal=False) - available_fmts = [] - for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): - attrs = extract_attributes(a) - href = attrs.get('href') - if href and '/freetrial' in href: - continue - available_fmts.append(fmt) - if not available_fmts: - for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'): - available_fmts = re.findall(p, webpage) - if available_fmts: - break - video_encode_ids = [] formats = [] - for fmt in available_fmts: - stream_quality, stream_format = self._FORMAT_IDS[fmt] - video_format = fmt + 'p' - stream_infos = [] - streamdata = self._call_rpc_api( - 'VideoPlayer_GetStandardConfig', video_id, - 'Downloading media info for %s' % video_format, data={ - 'media_id': video_id, - 'video_format': stream_format, - 'video_quality': stream_quality, - 'current_page': url, - }) - if streamdata is not None: - stream_info = streamdata.find('./{default}preload/stream_info') + for stream in media.get('streams', []): + formats.extend(self._extract_vrv_formats( + stream.get('url'), video_id, stream.get('format'), + stream.get('audio_lang'), stream.get('hardsub_lang'))) + if not formats: + available_fmts = [] + for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): + attrs = extract_attributes(a) + href = attrs.get('href') + if href and '/freetrial' in href: + continue + available_fmts.append(fmt) + if not available_fmts: + for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'): + available_fmts = re.findall(p, webpage) + if available_fmts: + break + if not available_fmts: + available_fmts = self._FORMAT_IDS.keys() + video_encode_ids = [] + + for fmt in available_fmts: + stream_quality, stream_format = self._FORMAT_IDS[fmt] + video_format = fmt + 'p' + stream_infos = [] + streamdata = self._call_rpc_api( + 'VideoPlayer_GetStandardConfig', video_id, + 'Downloading media info for %s' % video_format, data={ + 'media_id': video_id, + 'video_format': stream_format, + 'video_quality': stream_quality, + 'current_page': url, + }) + if streamdata is not None: + stream_info = streamdata.find('./{default}preload/stream_info') + if stream_info is not None: + stream_infos.append(stream_info) + stream_info = self._call_rpc_api( + 'VideoEncode_GetStreamInfo', video_id, + 'Downloading stream info for %s' % video_format, data={ + 'media_id': video_id, + 'video_format': stream_format, + 'video_encode_quality': stream_quality, + }) if stream_info is not None: stream_infos.append(stream_info) - stream_info = self._call_rpc_api( - 'VideoEncode_GetStreamInfo', video_id, - 'Downloading stream info for %s' % video_format, data={ - 'media_id': video_id, - 'video_format': stream_format, - 'video_encode_quality': stream_quality, - }) - if stream_info is not None: - stream_infos.append(stream_info) - for stream_info in stream_infos: - video_encode_id = xpath_text(stream_info, './video_encode_id') - if video_encode_id in video_encode_ids: - continue - video_encode_ids.append(video_encode_id) - - video_file = xpath_text(stream_info, './file') - if not video_file: - continue - if video_file.startswith('http'): - formats.extend(self._extract_m3u8_formats( - video_file, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue + for stream_info in stream_infos: + video_encode_id = xpath_text(stream_info, './video_encode_id') + if video_encode_id in video_encode_ids: + continue + video_encode_ids.append(video_encode_id) - video_url = xpath_text(stream_info, './host') - if not video_url: - continue - metadata = stream_info.find('./metadata') - format_info = { - 'format': video_format, - 'height': int_or_none(xpath_text(metadata, './height')), - 'width': int_or_none(xpath_text(metadata, './width')), - } - - if '.fplive.net/' in video_url: - video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip()) - parsed_video_url = compat_urlparse.urlparse(video_url) - direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace( - netloc='v.lvlt.crcdn.net', - path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1]))) - if self._is_valid_url(direct_video_url, video_id, video_format): - format_info.update({ - 'format_id': 'http-' + video_format, - 'url': direct_video_url, - }) - formats.append(format_info) + video_file = xpath_text(stream_info, './file') + if not video_file: + continue + if video_file.startswith('http'): + formats.extend(self._extract_m3u8_formats( + video_file, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) continue - format_info.update({ - 'format_id': 'rtmp-' + video_format, - 'url': video_url, - 'play_path': video_file, - 'ext': 'flv', - }) - formats.append(format_info) + video_url = xpath_text(stream_info, './host') + if not video_url: + continue + metadata = stream_info.find('./metadata') + format_info = { + 'format': video_format, + 'height': int_or_none(xpath_text(metadata, './height')), + 'width': int_or_none(xpath_text(metadata, './width')), + } + + if '.fplive.net/' in video_url: + video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip()) + parsed_video_url = compat_urlparse.urlparse(video_url) + direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace( + netloc='v.lvlt.crcdn.net', + path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1]))) + if self._is_valid_url(direct_video_url, video_id, video_format): + format_info.update({ + 'format_id': 'http-' + video_format, + 'url': direct_video_url, + }) + formats.append(format_info) + continue + + format_info.update({ + 'format_id': 'rtmp-' + video_format, + 'url': video_url, + 'play_path': video_file, + 'ext': 'flv', + }) + formats.append(format_info) self._sort_formats(formats, ('height', 'width', 'tbr', 'fps')) metadata = self._call_rpc_api( @@ -549,7 +563,17 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'media_id': video_id, }) - subtitles = self.extract_subtitles(video_id, webpage) + subtitles = {} + for subtitle in media.get('subtitles', []): + subtitle_url = subtitle.get('url') + if not subtitle_url: + continue + subtitles.setdefault(subtitle.get('language', 'enUS'), []).append({ + 'url': subtitle_url, + 'ext': subtitle.get('format', 'ass'), + }) + if not subtitles: + subtitles = self.extract_subtitles(video_id, webpage) # webpage provide more accurate data than series_title from XML series = self._html_search_regex( @@ -557,8 +581,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text webpage, 'series', fatal=False) season = xpath_text(metadata, 'series_title') - episode = xpath_text(metadata, 'episode_title') - episode_number = int_or_none(xpath_text(metadata, 'episode_number')) + episode = xpath_text(metadata, 'episode_title') or media_metadata.get('title') + episode_number = int_or_none(xpath_text(metadata, 'episode_number') or media_metadata.get('episode_number')) season_number = int_or_none(self._search_regex( r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)', @@ -568,7 +592,8 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text 'id': video_id, 'title': video_title, 'description': video_description, - 'thumbnail': xpath_text(metadata, 'episode_image_url'), + 'duration': float_or_none(media_metadata.get('duration'), 1000), + 'thumbnail': xpath_text(metadata, 'episode_image_url') or media_metadata.get('thumbnail', {}).get('url'), 'uploader': video_uploader, 'upload_date': video_upload_date, 'series': series, diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py @@ -72,7 +72,7 @@ class VRVBaseIE(InfoExtractor): class VRVIE(VRVBaseIE): IE_NAME = 'vrv' _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P<id>[A-Z0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT', 'info_dict': { 'id': 'GR9PNZ396', @@ -85,7 +85,28 @@ class VRVIE(VRVBaseIE): # m3u8 download 'skip_download': True, }, - } + }] + + def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang): + if not url or stream_format not in ('hls', 'dash'): + return [] + stream_id = hardsub_lang or audio_lang + format_id = '%s-%s' % (stream_format, stream_id) + if stream_format == 'hls': + adaptive_formats = self._extract_m3u8_formats( + url, video_id, 'mp4', m3u8_id=format_id, + note='Downloading %s m3u8 information' % stream_id, + fatal=False) + elif stream_format == 'dash': + adaptive_formats = self._extract_mpd_formats( + url, video_id, mpd_id=format_id, + note='Downloading %s MPD information' % stream_id, + fatal=False) + if audio_lang: + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = audio_lang + return adaptive_formats def _real_extract(self, url): video_id = self._match_id(url) @@ -115,26 +136,9 @@ class VRVIE(VRVBaseIE): for stream_type, streams in streams_json.get('streams', {}).items(): if stream_type in ('adaptive_hls', 'adaptive_dash'): for stream in streams.values(): - stream_url = stream.get('url') - if not stream_url: - continue - stream_id = stream.get('hardsub_locale') or audio_locale - format_id = '%s-%s' % (stream_type.split('_')[1], stream_id) - if stream_type == 'adaptive_hls': - adaptive_formats = self._extract_m3u8_formats( - stream_url, video_id, 'mp4', m3u8_id=format_id, - note='Downloading %s m3u8 information' % stream_id, - fatal=False) - else: - adaptive_formats = self._extract_mpd_formats( - stream_url, video_id, mpd_id=format_id, - note='Downloading %s MPD information' % stream_id, - fatal=False) - if audio_locale: - for f in adaptive_formats: - if f.get('acodec') != 'none': - f['language'] = audio_locale - formats.extend(adaptive_formats) + formats.extend(self._extract_vrv_formats( + stream.get('url'), video_id, stream_type.split('_')[1], + audio_locale, stream.get('hardsub_locale'))) self._sort_formats(formats) subtitles = {}