commit 15699ec8b0a9cb8d519b721a5cb8199afe62fc3c
parent 33cc1ea586480ccc60fd25f2d42cfb44eec605c6
Author: Sergey M․ <dstftw@gmail.com>
Date:   Fri,  7 Dec 2018 00:49:24 +0700

[nrktv:season,series] Fix extraction and update tests (closes #17159, closes #17258)

Diffstat:
Myoutube_dl/extractor/nrk.py | 68+++++++++++++++++++++++++++++++++++++++++++-------------------------
1 file changed, 43 insertions(+), 25 deletions(-)

diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py @@ -211,13 +211,13 @@ class NRKIE(NRKBaseIE): _TESTS = [{ # video 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': '2f7f6eeb2aacdd99885f355428715cfa', + 'md5': '706f34cdf1322577589e369e522b50ef', 'info_dict': { 'id': '150533', 'ext': 'mp4', 'title': 'Dompap og andre fugler i Piip-Show', 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', - 'duration': 263, + 'duration': 262, } }, { # audio @@ -256,14 +256,14 @@ class NRKTVIE(NRKBaseIE): _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', - 'md5': '4e9ca6629f09e588ed240fb11619922a', + 'md5': '9a167e54d04671eb6317a37b7bc8a280', 'info_dict': { 'id': 'MUHH48000314AA', 'ext': 'mp4', 'title': '20 spørsmål 23.05.2014', 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', 'duration': 1741, - 'series': '20 spørsmål - TV', + 'series': '20 spørsmål', 'episode': '23.05.2014', }, }, { @@ -301,7 +301,7 @@ class NRKTVIE(NRKBaseIE): 'id': 'MSPO40010515AH', 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', - 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', 'duration': 772, 'series': 'Tour de Ski', 'episode': '06.01.2015', @@ -314,7 +314,7 @@ class NRKTVIE(NRKBaseIE): 'id': 'MSPO40010515BH', 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', - 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', 'duration': 6175, 'series': 'Tour de Ski', 'episode': '06.01.2015', @@ -326,7 +326,7 @@ class NRKTVIE(NRKBaseIE): 'info_dict': { 'id': 'MSPO40010515', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', - 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', }, 'expected_warnings': ['Video is geo restricted'], }, { @@ -406,21 +406,35 @@ class NRKTVSerieBaseIE(InfoExtractor): def _extract_series(self, webpage, display_id, fatal=True): config = self._parse_json( self._search_regex( - r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>', webpage, 'config', - default='{}' if not fatal else NO_DEFAULT), + (r'INITIAL_DATA_*\s*=\s*({.+?})\s*;', + r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>'), + webpage, 'config', default='{}' if not fatal else NO_DEFAULT), display_id, fatal=False) if not config: return - return try_get(config, lambda x: x['series'], dict) + return try_get( + config, + (lambda x: x['initialState']['series'], lambda x: x['series']), + dict) + + def _extract_seasons(self, seasons): + if not isinstance(seasons, list): + return [] + entries = [] + for season in seasons: + entries.extend(self._extract_episodes(season)) + return entries def _extract_episodes(self, season): - entries = [] if not isinstance(season, dict): - return entries - episodes = season.get('episodes') - if not isinstance(episodes, list): - return entries - for episode in episodes: + return [] + return self._extract_entries(season.get('episodes')) + + def _extract_entries(self, entry_list): + if not isinstance(entry_list, list): + return [] + entries = [] + for episode in entry_list: nrk_id = episode.get('prfId') if not nrk_id or not isinstance(nrk_id, compat_str): continue @@ -465,7 +479,7 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)' _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)' _TESTS = [{ - # new layout + # new layout, seasons 'url': 'https://tv.nrk.no/serie/backstage', 'info_dict': { 'id': 'backstage', @@ -474,20 +488,21 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): }, 'playlist_mincount': 60, }, { - # old layout + # new layout, instalments 'url': 'https://tv.nrk.no/serie/groenn-glede', 'info_dict': { 'id': 'groenn-glede', 'title': 'Grønn glede', 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', }, - 'playlist_mincount': 9, + 'playlist_mincount': 10, }, { - 'url': 'http://tv.nrksuper.no/serie/labyrint', + # old layout + 'url': 'https://tv.nrksuper.no/serie/labyrint', 'info_dict': { 'id': 'labyrint', 'title': 'Labyrint', - 'description': 'md5:58afd450974c89e27d5a19212eee7115', + 'description': 'md5:318b597330fdac5959247c9b69fdb1ec', }, 'playlist_mincount': 3, }, { @@ -520,11 +535,11 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): description = try_get( series, lambda x: x['titles']['subtitle'], compat_str) entries = [] - for season in series['seasons']: - entries.extend(self._extract_episodes(season)) + entries.extend(self._extract_seasons(series.get('seasons'))) + entries.extend(self._extract_entries(series.get('instalments'))) return self.playlist_result(entries, series_id, title, description) - # Old layout (e.g. https://tv.nrk.no/serie/groenn-glede) + # Old layout (e.g. https://tv.nrksuper.no/serie/labyrint) entries = [ self.url_result( 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( @@ -536,6 +551,9 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): 'seriestitle', webpage, 'title', default=None) or self._og_search_title( webpage, fatal=False) + if title: + title = self._search_regex( + r'NRK (?:Super )?TV\s*[-–]\s*(.+)', title, 'title', default=title) description = self._html_search_meta( 'series_description', webpage, @@ -596,7 +614,7 @@ class NRKPlaylistIE(NRKPlaylistBaseIE): 'title': 'Rivertonprisen til Karin Fossum', 'description': 'Første kvinne på 15 år til å vinne krimlitteraturprisen.', }, - 'playlist_count': 5, + 'playlist_count': 2, }] def _extract_title(self, webpage):