commit ec240a43696478e43abb15e7c91f067b2bd5fe08
parent cd3a3ff93bd5d6866d3822cb438b0e172ffe4e39
Author: Remita Amine <remitamine@gmail.com>
Date:   Sat, 28 Jul 2018 20:29:56 +0100

[dailymotion:playlist] fix extraction(closes #16894)

Diffstat:
Myoutube_dl/extractor/dailymotion.py | 124+++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
1 file changed, 83 insertions(+), 41 deletions(-)

diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import base64 +import functools import hashlib import itertools import json @@ -16,11 +17,13 @@ from ..utils import ( error_to_compat_str, ExtractorError, int_or_none, + mimetype2ext, + OnDemandPagedList, parse_iso8601, sanitized_Request, str_to_int, unescapeHTML, - mimetype2ext, + urlencode_postdata, ) @@ -343,58 +346,73 @@ class DailymotionIE(DailymotionBaseInfoExtractor): class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): IE_NAME = 'dailymotion:playlist' - _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>[^/?#&]+)' - _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"' - _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s' + _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>x[0-9a-z]+)' _TESTS = [{ 'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q', 'info_dict': { 'title': 'SPORT', - 'id': 'xv4bw_nqtv_sport', + 'id': 'xv4bw', }, 'playlist_mincount': 20, }] - - def _extract_entries(self, id): - video_ids = set() - processed_urls = set() - for pagenum in itertools.count(1): - page_url = self._PAGE_TEMPLATE % (id, pagenum) - webpage, urlh = self._download_webpage_handle_no_ff( - page_url, id, 'Downloading page %s' % pagenum) - if urlh.geturl() in processed_urls: - self.report_warning('Stopped at duplicated page %s, which is the same as %s' % ( - page_url, urlh.geturl()), id) - break - - processed_urls.add(urlh.geturl()) - - for video_id in re.findall(r'data-xid="(.+?)"', webpage): - if video_id not in video_ids: - yield self.url_result( - 'http://www.dailymotion.com/video/%s' % video_id, - DailymotionIE.ie_key(), video_id) - video_ids.add(video_id) - - if re.search(self._MORE_PAGES_INDICATOR, webpage) is None: - break + _PAGE_SIZE = 100 + + def _fetch_page(self, playlist_id, authorizaion, page): + page += 1 + videos = self._download_json( + 'https://graphql.api.dailymotion.com', + playlist_id, 'Downloading page %d' % page, + data=json.dumps({ + 'query': '''{ + collection(xid: "%s") { + videos(first: %d, page: %d) { + pageInfo { + hasNextPage + nextPage + } + edges { + node { + xid + url + } + } + } + } +}''' % (playlist_id, self._PAGE_SIZE, page) + }).encode(), headers={ + 'Authorization': authorizaion, + 'Origin': 'https://www.dailymotion.com', + })['data']['collection']['videos'] + for edge in videos['edges']: + node = edge['node'] + yield self.url_result( + node['url'], DailymotionIE.ie_key(), node['xid']) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') + playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': self._og_search_title(webpage), - 'entries': self._extract_entries(playlist_id), - } - - -class DailymotionUserIE(DailymotionPlaylistIE): + api = self._parse_json(self._search_regex( + r'__PLAYER_CONFIG__\s*=\s*({.+?});', + webpage, 'player config'), playlist_id)['context']['api'] + auth = self._download_json( + api.get('auth_url', 'https://graphql.api.dailymotion.com/oauth/token'), + playlist_id, data=urlencode_postdata({ + 'client_id': api.get('client_id', 'f1a362d288c1b98099c7'), + 'client_secret': api.get('client_secret', 'eea605b96e01c796ff369935357eca920c5da4c5'), + 'grant_type': 'client_credentials', + })) + authorizaion = '%s %s' % (auth.get('token_type', 'Bearer'), auth['access_token']) + entries = OnDemandPagedList(functools.partial( + self._fetch_page, playlist_id, authorizaion), self._PAGE_SIZE) + return self.playlist_result( + entries, playlist_id, + self._og_search_title(webpage)) + + +class DailymotionUserIE(DailymotionBaseInfoExtractor): IE_NAME = 'dailymotion:user' _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)' + _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"' _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s' _TESTS = [{ 'url': 'https://www.dailymotion.com/user/nqtv', @@ -416,6 +434,30 @@ class DailymotionUserIE(DailymotionPlaylistIE): 'skip': 'Takes too long time', }] + def _extract_entries(self, id): + video_ids = set() + processed_urls = set() + for pagenum in itertools.count(1): + page_url = self._PAGE_TEMPLATE % (id, pagenum) + webpage, urlh = self._download_webpage_handle_no_ff( + page_url, id, 'Downloading page %s' % pagenum) + if urlh.geturl() in processed_urls: + self.report_warning('Stopped at duplicated page %s, which is the same as %s' % ( + page_url, urlh.geturl()), id) + break + + processed_urls.add(urlh.geturl()) + + for video_id in re.findall(r'data-xid="(.+?)"', webpage): + if video_id not in video_ids: + yield self.url_result( + 'http://www.dailymotion.com/video/%s' % video_id, + DailymotionIE.ie_key(), video_id) + video_ids.add(video_id) + + if re.search(self._MORE_PAGES_INDICATOR, webpage) is None: + break + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user = mobj.group('user')