commit 72791634127cc3093592c807225ec684af1cfcc9
parent 07ab44c420a79d1faae09d00323242746e522c4c
Author: Sergey M․ <dstftw@gmail.com>
Date:   Wed, 31 Jul 2019 02:31:19 +0700

[tvn24] Fix metadata extraction (closes #21833, closes #21834)

Diffstat:
M youtube_dl/extractor/tvn24.py | 42 +++++++++++++++++++++++++++++++++---------
1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/youtube_dl/extractor/tvn24.py b/youtube_dl/extractor/tvn24.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
+    NO_DEFAULT,
     unescapeHTML,
 )
 
@@ -21,6 +22,18 @@ class TVN24IE(InfoExtractor):
             'thumbnail': 're:https?://.*[.]jpeg',
         }
     }, {
+        # different layout
+        'url': 'https://tvnmeteo.tvn24.pl/magazyny/maja-w-ogrodzie,13/odcinki-online,1,4,1,0/pnacza-ptaki-i-iglaki-odc-691-hgtv-odc-29,1771763.html',
+        'info_dict': {
+            'id': '1771763',
+            'ext': 'mp4',
+            'title': 'Pnącza, ptaki i iglaki (odc. 691 /HGTV odc. 29)',
+            'thumbnail': 're:https?://.*',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
         'url': 'http://fakty.tvn24.pl/ogladaj-online,60/53-konferencja-bezpieczenstwa-w-monachium,716431.html',
         'only_matching': True,
     }, {
@@ -35,18 +48,21 @@ class TVN24IE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        display_id = self._match_id(url)
 
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(url, display_id)
 
-        title = self._og_search_title(webpage)
+        title = self._og_search_title(
+            webpage, default=None) or self._search_regex(
+            r'<h\d+[^>]+class=["\']magazineItemHeader[^>]+>(.+?)</h',
+            webpage, 'title')
 
-        def extract_json(attr, name, fatal=True):
+        def extract_json(attr, name, default=NO_DEFAULT, fatal=True):
             return self._parse_json(
                 self._search_regex(
                     r'\b%s=(["\'])(?P<json>(?!\1).+?)\1' % attr, webpage,
-                    name, group='json', fatal=fatal) or '{}',
-                video_id, transform_source=unescapeHTML, fatal=fatal)
+                    name, group='json', default=default, fatal=fatal) or '{}',
+                display_id, transform_source=unescapeHTML, fatal=fatal)
 
         quality_data = extract_json('data-quality', 'formats')
 
@@ -59,16 +75,24 @@ class TVN24IE(InfoExtractor):
             })
         self._sort_formats(formats)
 
-        description = self._og_search_description(webpage)
+        description = self._og_search_description(webpage, default=None)
         thumbnail = self._og_search_thumbnail(
             webpage, default=None) or self._html_search_regex(
             r'\bdata-poster=(["\'])(?P<url>(?!\1).+?)\1', webpage,
             'thumbnail', group='url')
 
+        video_id = None
+
         share_params = extract_json(
-            'data-share-params', 'share params', fatal=False)
+            'data-share-params', 'share params', default=None)
         if isinstance(share_params, dict):
-            video_id = share_params.get('id') or video_id
+            video_id = share_params.get('id')
+
+        if not video_id:
+            video_id = self._search_regex(
+                r'data-vid-id=["\'](\d+)', webpage, 'video id',
+                default=None) or self._search_regex(
+                r',(\d+)\.html', url, 'video id', default=display_id)
 
         return {
             'id': video_id,