commit 87f89dacddfa46399aea9252ca078f5f386dce38
parent 9b0b62753432244fd062d99cc5dc604d6bad7877
Author: Sergey M․ <dstftw@gmail.com>
Date:   Sat, 16 Jun 2018 02:55:20 +0700

[pbs] Improve extraction (closes #16623, closes #16684)

Diffstat:
M youtube_dl/extractor/pbs.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 50 insertions(+), 7 deletions(-)

diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     determine_ext,
@@ -376,6 +377,35 @@ class PBSIE(InfoExtractor):
             'expected_warnings': ['HTTP Error 403: Forbidden'],
         },
         {
+            'url': 'https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/',
+            'info_dict': {
+                'id': '3007193718',
+                'ext': 'mp4',
+                'title': "Victoria - A Soldier's Daughter / The Green-Eyed Monster",
+                'description': 'md5:37efbac85e0c09b009586523ec143652',
+                'duration': 6292,
+                'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'expected_warnings': ['HTTP Error 403: Forbidden'],
+        },
+        {
+            'url': 'https://player.pbs.org/partnerplayer/tOz9tM5ljOXQqIIWke53UA==/',
+            'info_dict': {
+                'id': '3011407934',
+                'ext': 'mp4',
+                'title': 'Stories from the Stage - Road Trip',
+                'duration': 1619,
+                'thumbnail': r're:^https?://.*\.(?:jpg|JPG)$',
+            },
+            'params': {
+                'skip_download': True,
+            },
+            'expected_warnings': ['HTTP Error 403: Forbidden'],
+        },
+        {
             'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
             'only_matching': True,
         },
@@ -438,6 +468,7 @@ class PBSIE(InfoExtractor):
                 r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>',  # jwplayer
                 r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',",
                 r'<div[^>]+\bdata-cove-id=["\'](\d+)"',  # http://www.pbs.org/wgbh/roadshow/watch/episode/2105-indianapolis-hour-2/
+                r'<iframe[^>]+\bsrc=["\'](?:https?:)?//video\.pbs\.org/widget/partnerplayer/(\d+)',  # https://www.pbs.org/wgbh/masterpiece/episodes/victoria-s2-e1/
             ]
 
             media_id = self._search_regex(
@@ -472,7 +503,8 @@ class PBSIE(InfoExtractor):
             if not url:
                 url = self._og_search_url(webpage)
 
-            mobj = re.match(self._VALID_URL, url)
+            mobj = re.match(
+                self._VALID_URL, self._proto_relative_url(url.strip()))
 
         player_id = mobj.group('player_id')
         if not display_id:
@@ -482,13 +514,27 @@ class PBSIE(InfoExtractor):
                 url, display_id, note='Downloading player page',
                 errnote='Could not download player page')
             video_id = self._search_regex(
-                r'<div\s+id="video_([0-9]+)"', player_page, 'video ID')
+                r'<div\s+id=["\']video_(\d+)', player_page, 'video ID',
+                default=None)
+            if not video_id:
+                video_info = self._extract_video_data(
+                    player_page, 'video data', display_id)
+                video_id = compat_str(
+                    video_info.get('id') or video_info['contentID'])
         else:
             video_id = mobj.group('id')
             display_id = video_id
 
         return video_id, display_id, None, description
 
+    def _extract_video_data(self, string, name, video_id, fatal=True):
+        return self._parse_json(
+            self._search_regex(
+                [r'(?s)PBS\.videoData\s*=\s*({.+?});\n',
+                 r'window\.videoBridge\s*=\s*({.+?});'],
+                string, name, default='{}'),
+            video_id, transform_source=js_to_json, fatal=fatal)
+
     def _real_extract(self, url):
         video_id, display_id, upload_date, description = self._extract_webpage(url)
 
@@ -519,11 +565,8 @@ class PBSIE(InfoExtractor):
                 'http://player.pbs.org/%s/%s' % (page, video_id),
                 display_id, 'Downloading %s page' % page, fatal=False)
             if player:
-                video_info = self._parse_json(
-                    self._search_regex(
-                        [r'(?s)PBS\.videoData\s*=\s*({.+?});\n', r'window\.videoBridge\s*=\s*({.+?});'],
-                        player, '%s video data' % page, default='{}'),
-                    display_id, transform_source=js_to_json, fatal=False)
+                video_info = self._extract_video_data(
+                    player, '%s video data' % page, display_id, fatal=False)
                 if video_info:
                     extract_redirect_urls(video_info)
                     if not info: