commit 59296bae7ec6d15b0df37dce34bdd96381c0e743
parent 755541a4c8ac3dd4e8b9abd0c7df95182a1f3fd4
Author: Remita Amine <remitamine@gmail.com>
Date:   Thu, 17 Oct 2019 13:26:45 +0100

[xfileshare] clean extractor

- update the list of domains
- add support for aa-encoded video data
- improve jwplayer format extraction
- add support for Clappr sources

closes #17032
closes #17906
closes #18237
closes #18239

Diffstat:
M youtube_dl/extractor/xfileshare.py | 192 +++++++++++++++++++++++++++++++++++--------------------------------------------
1 file changed, 86 insertions(+), 106 deletions(-)

diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py @@ -4,37 +4,64 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_chr from ..utils import ( decode_packed_codes, determine_ext, ExtractorError, int_or_none, - NO_DEFAULT, + js_to_json, urlencode_postdata, ) +# based on openload_decode from 2bfeee69b976fe049761dd3012e30b637ee05a58 +def aa_decode(aa_code): + symbol_table = [ + ('7', '((゚ー゚) + (o^_^o))'), + ('6', '((o^_^o) +(o^_^o))'), + ('5', '((゚ー゚) + (゚Θ゚))'), + ('2', '((o^_^o) - (゚Θ゚))'), + ('4', '(゚ー゚)'), + ('3', '(o^_^o)'), + ('1', '(゚Θ゚)'), + ('0', '(c^_^o)'), + ] + delim = '(゚Д゚)[゚ε゚]+' + ret = '' + for aa_char in aa_code.split(delim): + for val, pat in symbol_table: + aa_char = aa_char.replace(pat, val) + aa_char = aa_char.replace('+ ', '') + m = re.match(r'^\d+', aa_char) + if m: + ret += compat_chr(int(m.group(0), 8)) + else: + m = re.match(r'^u([\da-f]+)', aa_char) + if m: + ret += compat_chr(int(m.group(1), 16)) + return ret + + class XFileShareIE(InfoExtractor): _SITES = ( - (r'daclips\.(?:in|com)', 'DaClips'), - (r'filehoot\.com', 'FileHoot'), - (r'gorillavid\.(?:in|com)', 'GorillaVid'), - (r'movpod\.in', 'MovPod'), - (r'powerwatch\.pw', 'PowerWatch'), - (r'rapidvideo\.ws', 'Rapidvideo.ws'), + (r'clipwatching\.com', 'ClipWatching'), + (r'gounlimited\.to', 'GoUnlimited'), + (r'govid\.me', 'GoVid'), + (r'holavid\.com', 'HolaVid'), + (r'streamty\.com', 'Streamty'), (r'thevideobee\.to', 'TheVideoBee'), - (r'vidto\.(?:me|se)', 'Vidto'), - (r'streamin\.to', 'Streamin.To'), - (r'xvidstage\.com', 'XVIDSTAGE'), - (r'vidabc\.com', 'Vid ABC'), + (r'uqload\.com', 'Uqload'), (r'vidbom\.com', 'VidBom'), (r'vidlo\.us', 'vidlo'), - (r'rapidvideo\.(?:cool|org)', 'RapidVideo.TV'), - (r'fastvideo\.me', 'FastVideo.me'), + (r'vidlocker\.xyz', 'VidLocker'), + (r'vidshare\.tv', 'VidShare'), + (r'vup\.to', 'VUp'), + (r'xvideosharing\.com', 'XVideoSharing'), ) IE_DESC = 
'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1]) - _VALID_URL = (r'https?://(?P<host>(?:www\.)?(?:%s))/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' + _VALID_URL = (r'https?://(?:www\.)?(?P<host>%s)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' % '|'.join(site for site in list(zip(*_SITES))[0])) _FILE_NOT_FOUND_REGEXES = ( @@ -43,82 +70,14 @@ class XFileShareIE(InfoExtractor): ) _TESTS = [{ - 'url': 'http://gorillavid.in/06y9juieqpmi', - 'md5': '5ae4a3580620380619678ee4875893ba', - 'info_dict': { - 'id': '06y9juieqpmi', - 'ext': 'mp4', - 'title': 'Rebecca Black My Moment Official Music Video Reaction-6GK87Rc8bzQ', - 'thumbnail': r're:http://.*\.jpg', - }, - }, { - 'url': 'http://gorillavid.in/embed-z08zf8le23c6-960x480.html', - 'only_matching': True, - }, { - 'url': 'http://daclips.in/3rso4kdn6f9m', - 'md5': '1ad8fd39bb976eeb66004d3a4895f106', + 'url': 'http://xvideosharing.com/fq65f94nd2ve', + 'md5': '4181f63957e8fe90ac836fa58dc3c8a6', 'info_dict': { - 'id': '3rso4kdn6f9m', + 'id': 'fq65f94nd2ve', 'ext': 'mp4', - 'title': 'Micro Pig piglets ready on 16th July 2009-bG0PdrCdxUc', + 'title': 'sample', 'thumbnail': r're:http://.*\.jpg', - } - }, { - 'url': 'http://movpod.in/0wguyyxi1yca', - 'only_matching': True, - }, { - 'url': 'http://filehoot.com/3ivfabn7573c.html', - 'info_dict': { - 'id': '3ivfabn7573c', - 'ext': 'mp4', - 'title': 'youtube-dl test video \'äBaW_jenozKc.mp4.mp4', - 'thumbnail': r're:http://.*\.jpg', - }, - 'skip': 'Video removed', - }, { - 'url': 'http://vidto.me/ku5glz52nqe1.html', - 'info_dict': { - 'id': 'ku5glz52nqe1', - 'ext': 'mp4', - 'title': 'test' - } - }, { - 'url': 'http://powerwatch.pw/duecjibvicbu', - 'info_dict': { - 'id': 'duecjibvicbu', - 'ext': 'mp4', - 'title': 'Big Buck Bunny trailer', - }, - }, { - 'url': 'http://xvidstage.com/e0qcnl03co6z', - 'info_dict': { - 'id': 'e0qcnl03co6z', - 'ext': 'mp4', - 'title': 'Chucky Prank 2015.mp4', - }, - }, { - # removed by administrator - 'url': 'http://xvidstage.com/amfy7atlkx25', - 'only_matching': 
True, - }, { - 'url': 'http://vidabc.com/i8ybqscrphfv', - 'info_dict': { - 'id': 'i8ybqscrphfv', - 'ext': 'mp4', - 'title': 're:Beauty and the Beast 2017', - }, - 'params': { - 'skip_download': True, }, - }, { - 'url': 'http://www.rapidvideo.cool/b667kprndr8w', - 'only_matching': True, - }, { - 'url': 'http://www.fastvideo.me/k8604r8nk8sn/FAST_FURIOUS_8_-_Trailer_italiano_ufficiale.mp4.html', - 'only_matching': True, - }, { - 'url': 'http://vidto.se/1tx1pf6t12cg.html', - 'only_matching': True, }] @staticmethod @@ -131,10 +90,9 @@ class XFileShareIE(InfoExtractor): webpage)] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + host, video_id = re.match(self._VALID_URL, url).groups() - url = 'http://%s/%s' % (mobj.group('host'), video_id) + url = 'https://%s/' % host + ('embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id) webpage = self._download_webpage(url, video_id) if any(re.search(p, webpage) for p in self._FILE_NOT_FOUND_REGEXES): @@ -142,7 +100,7 @@ class XFileShareIE(InfoExtractor): fields = self._hidden_inputs(webpage) - if fields['op'] == 'download1': + if fields.get('op') == 'download1': countdown = int_or_none(self._search_regex( r'<span id="countdown_str">(?:[Ww]ait)?\s*<span id="cxc">(\d+)</span>\s*(?:seconds?)?</span>', webpage, 'countdown', default=None)) @@ -160,13 +118,37 @@ class XFileShareIE(InfoExtractor): (r'style="z-index: [0-9]+;">([^<]+)</span>', r'<td nowrap>([^<]+)</td>', r'h4-fine[^>]*>([^<]+)<', - r'>Watch (.+) ', + r'>Watch (.+)[ <]', r'<h2 class="video-page-head">([^<]+)</h2>', - r'<h2 style="[^"]*color:#403f3d[^"]*"[^>]*>([^<]+)<'), # streamin.to + r'<h2 style="[^"]*color:#403f3d[^"]*"[^>]*>([^<]+)<', # streamin.to + r'title\s*:\s*"([^"]+)"'), # govid.me webpage, 'title', default=None) or self._og_search_title( webpage, default=None) or video_id).strip() - def extract_formats(default=NO_DEFAULT): + for regex, func in ( + (r'(eval\(function\(p,a,c,k,e,d\){.+)', 
decode_packed_codes), + (r'(゚.+)', aa_decode)): + obf_code = self._search_regex(regex, webpage, 'obfuscated code', default=None) + if obf_code: + webpage = webpage.replace(obf_code, func(obf_code)) + + formats = [] + + jwplayer_data = self._search_regex( + [ + r'jwplayer\("[^"]+"\)\.load\(\[({.+?})\]\);', + r'jwplayer\("[^"]+"\)\.setup\(({.+?})\);', + ], webpage, + 'jwplayer data', default=None) + if jwplayer_data: + jwplayer_data = self._parse_json( + jwplayer_data.replace(r"\'", "'"), video_id, js_to_json) + if jwplayer_data: + formats = self._parse_jwplayer_data( + jwplayer_data, video_id, False, + m3u8_id='hls', mpd_id='dash')['formats'] + + if not formats: urls = [] for regex in ( r'(?:file|src)\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', @@ -177,6 +159,12 @@ class XFileShareIE(InfoExtractor): video_url = mobj.group('url') if video_url not in urls: urls.append(video_url) + + sources = self._search_regex( + r'sources\s*:\s*(\[(?!{)[^\]]+\])', webpage, 'sources', default=None) + if sources: + urls.extend(self._parse_json(sources, video_id)) + formats = [] for video_url in urls: if determine_ext(video_url) == 'm3u8': @@ -189,21 +177,13 @@ class XFileShareIE(InfoExtractor): 'url': video_url, 'format_id': 'sd', }) - if not formats and default is not NO_DEFAULT: - return default - self._sort_formats(formats) - return formats - - formats = extract_formats(default=None) - - if not formats: - webpage = decode_packed_codes(self._search_regex( - r"(}\('(.+)',(\d+),(\d+),'[^']*\b(?:file|embed)\b[^']*'\.split\('\|'\))", - webpage, 'packed code')) - formats = extract_formats() + self._sort_formats(formats) thumbnail = self._search_regex( - r'image\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'thumbnail', default=None) + [ + r'<video[^>]+poster="([^"]+)"', + r'(?:image|poster)\s*:\s*["\'](http[^"\']+)["\'],', + ], webpage, 'thumbnail', default=None) return { 'id': video_id,