commit a42839e548d81ae20e5164ae690075d2c423477e
parent d6166a7602f5b78a4bb552ba0f4b176cbc0a4a03
Author: Sergey M․ <dstftw@gmail.com>
Date:   Mon, 16 Apr 2018 00:31:25 +0700

[picarto] Improve extraction (closes #6205, closes #12514, closes #15276, closes #15551)

Diffstat:
Myoutube_dl/extractor/extractors.py | 2+-
Myoutube_dl/extractor/picarto.py | 152++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
2 files changed, 116 insertions(+), 38 deletions(-)

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py @@ -816,8 +816,8 @@ from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE from .picarto import ( - PicartoVodIE, PicartoIE, + PicartoVodIE, ) from .piksel import PikselIE from .pinkbike import PinkbikeIE diff --git a/youtube_dl/extractor/picarto.py b/youtube_dl/extractor/picarto.py @@ -1,12 +1,21 @@ # coding: utf-8 from __future__ import unicode_literals +import time + from .common import InfoExtractor -from ..utils import ExtractorError, js_to_json, urlencode_postdata +from ..compat import compat_str +from ..utils import ( + ExtractorError, + js_to_json, + try_get, + update_url_query, + urlencode_postdata, +) class PicartoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)[^/]*$' + _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)' _TEST = { 'url': 'https://picarto.tv/Setz', 'info_dict': { @@ -16,72 +25,141 @@ class PicartoIE(InfoExtractor): 'timestamp': int, 'is_live': True }, - 'params': { - 'skip_download': True - } + 'skip': 'Stream is offline', } + @classmethod + def suitable(cls, url): + return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url) + def _real_extract(self, url): channel_id = self._match_id(url) stream_page = self._download_webpage(url, channel_id) - if 'This channel does not exist.' in stream_page: - raise ExtractorError('Channel does not exist', expected=True) + if '>This channel does not exist' in stream_page: + raise ExtractorError( + 'Channel %s does not exist' % channel_id, expected=True) - player_settings_js = self._html_search_regex( - r'(?s)playerSettings\[1\]\s*=\s*(\{.+?\}\n)', stream_page, 'player-settings') - player_settings = self._parse_json(player_settings_js, channel_id, - transform_source=js_to_json) - if not player_settings.get('online'): + player = self._parse_json( + self._search_regex( + r'(?s)playerSettings\[\d+\]\s*=\s*(\{.+?\}\s*\n)', stream_page, + 'player settings'), + channel_id, transform_source=js_to_json) + + if player.get('online') is False: raise ExtractorError('Stream is offline', expected=True) - cdn_data = self._download_json('https://picarto.tv/process/channel', channel_id, + cdn_data = self._download_json( + 'https://picarto.tv/process/channel', channel_id, data=urlencode_postdata({'loadbalancinginfo': channel_id}), - note='Fetching load balancer info') - edge = [edge['ep'] for edge in cdn_data['edges'] if edge['id'] == cdn_data['preferedEdge']][0] + note='Downloading load balancing info') + + def get_event(key): + return try_get(player, lambda x: x['event'][key], compat_str) or '' - formats = self._extract_m3u8_formats('https://%s/hls/%s/index.m3u8' % (edge, channel_id), - channel_id, 'mp4') - formats.append({'url': 'https://%s/mp4/%s.mp4' % (edge, channel_id)}) + params = { + 'token': player.get('token') or '', + 'ticket': get_event('ticket'), + 'con': int(time.time() * 1000), + 'type': get_event('ticket'), + 'scope': get_event('scope'), + } + + prefered_edge = cdn_data.get('preferedEdge') + default_tech = player.get('defaultTech') + + formats = [] + + for edge in cdn_data['edges']: + edge_ep = edge.get('ep') + if not edge_ep or not isinstance(edge_ep, compat_str): + continue + edge_id = edge.get('id') + for tech in cdn_data['techs']: + tech_label = tech.get('label') + tech_type = tech.get('type') + preference = 0 + if edge_id == prefered_edge: + preference += 1 + if tech_type == default_tech: + preference += 1 + format_id = [] + if edge_id: + format_id.append(edge_id) + if tech_type == 'application/x-mpegurl' or tech_label == 'HLS': + format_id.append('hls') + formats.extend(self._extract_m3u8_formats( + update_url_query( + 'https://%s/hls/%s/index.m3u8' + % (edge_ep, channel_id), params), + channel_id, 'mp4', preference=preference, + m3u8_id='-'.join(format_id), fatal=False)) + continue + elif tech_type == 'video/mp4' or tech_label == 'MP4': + format_id.append('mp4') + formats.append({ + 'url': update_url_query( + 'https://%s/mp4/%s.mp4' % (edge_ep, channel_id), + params), + 'format_id': '-'.join(format_id), + 'preference': preference, + }) + else: + # rtmp format does not seem to work + continue self._sort_formats(formats) + mature = player.get('mature') + if mature is None: + age_limit = None + else: + age_limit = 18 if mature is True else 0 + return { 'id': channel_id, - 'formats': formats, - 'ext': 'mp4', 'title': self._live_title(channel_id), 'is_live': True, - 'thumbnail': player_settings.get('vodThumb'), - 'age_limit': 18 if player_settings.get('mature') else None, + 'thumbnail': player.get('vodThumb'), + 'age_limit': age_limit, + 'formats': formats, } class PicartoVodIE(InfoExtractor): - _VALID_URL = r'https?://(?:www.)?picarto\.tv/videopopout/(?P<id>[a-zA-Z0-9_\-\.]+).flv' - _TEST = { - 'url': 'https://picarto.tv/videopopout/Carrot_2018.01.11.07.55.12.flv', - 'md5': '80765b67813053ff31d4df2bd5e900ce', + _VALID_URL = r'https?://(?:www.)?picarto\.tv/videopopout/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://picarto.tv/videopopout/ArtofZod_2017.12.12.00.13.23.flv', + 'md5': '3ab45ba4352c52ee841a28fb73f2d9ca', 'info_dict': { - 'id': 'Carrot_2018.01.11.07.55.12', + 'id': 'ArtofZod_2017.12.12.00.13.23.flv', 'ext': 'mp4', - 'title': 'Carrot_2018.01.11.07.55.12', - 'thumbnail': r're:^https?://.*\.jpg$' - } - } + 'title': 'ArtofZod_2017.12.12.00.13.23.flv', + 'thumbnail': r're:^https?://.*\.jpg' + }, + }, { + 'url': 'https://picarto.tv/videopopout/Plague', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - vod_info_js = self._html_search_regex(r'(?s)"#vod-player",\s*(\{.+?\})\)', - webpage, video_id) - vod_info = self._parse_json(vod_info_js, video_id, transform_source=js_to_json) + vod_info = self._parse_json( + self._search_regex( + r'(?s)#vod-player["\']\s*,\s*(\{.+?\})\s*\)', webpage, + video_id), + video_id, transform_source=js_to_json) + + formats = self._extract_m3u8_formats( + vod_info['vod'], video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) return { 'id': video_id, 'title': video_id, - 'ext': 'mp4', - 'protocol': 'm3u8', - 'url': vod_info['vod'], 'thumbnail': vod_info.get('vodThumb'), + 'formats': formats, }