commit 18ca61c5e153d1c1cb8b9a2de3c8b9dfdaa69b0e
parent 0b16b3c2d35d1706ec5c55e5b06352c753127368
Author: Remita Amine <remitamine@gmail.com>
Date:   Sat,  9 Nov 2019 09:23:20 +0100

[twitter] improve extraction

- add support for generic embeds (closes #22168)
- always extract http formats for native videos (closes #14934)
- add support for Twitter Broadcasts (closes #21369)
- extract more metadata
- improve VMap format extraction
- unify extraction code for both twitter statuses and cards

Diffstat:
M youtube_dl/extractor/extractors.py | 1 +
M youtube_dl/extractor/periscope.py | 80 ++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------
M youtube_dl/extractor/twitter.py | 578 +++++++++++++++++++++++++++++++++++++++++--------------------------------------
3 files changed, 348 insertions(+), 311 deletions(-)

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py @@ -1241,6 +1241,7 @@ from .twitter import ( TwitterCardIE, TwitterIE, TwitterAmplifyIE, + TwitterBroadcastIE, ) from .udemy import ( UdemyIE, diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py @@ -17,12 +17,54 @@ class PeriscopeBaseIE(InfoExtractor): 'https://api.periscope.tv/api/v2/%s' % method, item_id, query=query) + def _parse_broadcast_data(self, broadcast, video_id): + title = broadcast['status'] + uploader = broadcast.get('user_display_name') or broadcast.get('username') + title = '%s - %s' % (uploader, title) if uploader else title + is_live = broadcast.get('state').lower() == 'running' + + thumbnails = [{ + 'url': broadcast[image], + } for image in ('image_url', 'image_url_small') if broadcast.get(image)] + + return { + 'id': broadcast.get('id') or video_id, + 'title': self._live_title(title) if is_live else title, + 'timestamp': parse_iso8601(broadcast.get('created_at')), + 'uploader': uploader, + 'uploader_id': broadcast.get('user_id') or broadcast.get('username'), + 'thumbnails': thumbnails, + 'view_count': int_or_none(broadcast.get('total_watched')), + 'tags': broadcast.get('tags'), + 'is_live': is_live, + } + + @staticmethod + def _extract_common_format_info(broadcast): + return broadcast.get('state').lower(), int_or_none(broadcast.get('width')), int_or_none(broadcast.get('height')) + + @staticmethod + def _add_width_and_height(f, width, height): + for key, val in (('width', width), ('height', height)): + if not f.get(key): + f[key] = val + + def _extract_pscp_m3u8_formats(self, m3u8_url, video_id, format_id, state, width, height, fatal=True): + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + entry_protocol='m3u8_native' + if state in ('ended', 'timed_out') else 'm3u8', + m3u8_id=format_id, fatal=fatal) + if len(m3u8_formats) == 1: + self._add_width_and_height(m3u8_formats[0], width, height) + return 
m3u8_formats + class PeriscopeIE(PeriscopeBaseIE): IE_DESC = 'Periscope' IE_NAME = 'periscope' _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P<id>[^/?#]+)' - # Alive example URLs can be found here http://onperiscope.com/ + # Alive example URLs can be found here https://www.periscope.tv/ _TESTS = [{ 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', 'md5': '65b57957972e503fcbbaeed8f4fa04ca', @@ -61,21 +103,9 @@ class PeriscopeIE(PeriscopeBaseIE): 'accessVideoPublic', {'broadcast_id': token}, token) broadcast = stream['broadcast'] - title = broadcast['status'] - - uploader = broadcast.get('user_display_name') or broadcast.get('username') - uploader_id = (broadcast.get('user_id') or broadcast.get('username')) + info = self._parse_broadcast_data(broadcast, token) - title = '%s - %s' % (uploader, title) if uploader else title state = broadcast.get('state').lower() - if state == 'running': - title = self._live_title(title) - timestamp = parse_iso8601(broadcast.get('created_at')) - - thumbnails = [{ - 'url': broadcast[image], - } for image in ('image_url', 'image_url_small') if broadcast.get(image)] - width = int_or_none(broadcast.get('width')) height = int_or_none(broadcast.get('height')) @@ -92,32 +122,20 @@ class PeriscopeIE(PeriscopeBaseIE): continue video_urls.add(video_url) if format_id != 'rtmp': - m3u8_formats = self._extract_m3u8_formats( - video_url, token, 'mp4', - entry_protocol='m3u8_native' - if state in ('ended', 'timed_out') else 'm3u8', - m3u8_id=format_id, fatal=False) - if len(m3u8_formats) == 1: - add_width_and_height(m3u8_formats[0]) + m3u8_formats = self._extract_pscp_m3u8_formats( + video_url, token, format_id, state, width, height, False) formats.extend(m3u8_formats) continue rtmp_format = { 'url': video_url, 'ext': 'flv' if format_id == 'rtmp' else 'mp4', } - add_width_and_height(rtmp_format) + self._add_width_and_height(rtmp_format) formats.append(rtmp_format) 
self._sort_formats(formats) - return { - 'id': broadcast.get('id') or token, - 'title': title, - 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'thumbnails': thumbnails, - 'formats': formats, - } + info['formats'] = formats + return info class PeriscopeUserIE(PeriscopeBaseIE): diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py @@ -4,32 +4,67 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_HTTPError, + compat_parse_qs, + compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, +) from ..utils import ( - determine_ext, dict_get, ExtractorError, float_or_none, int_or_none, - remove_end, try_get, + strip_or_none, + unified_timestamp, + update_url_query, xpath_text, ) -from .periscope import PeriscopeIE +from .periscope import ( + PeriscopeBaseIE, + PeriscopeIE, +) class TwitterBaseIE(InfoExtractor): + _API_BASE = 'https://api.twitter.com/1.1/' + _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?twitter\.com/' + _GUEST_TOKEN = None + + def _extract_variant_formats(self, variant, video_id): + variant_url = variant.get('url') + if not variant_url: + return [] + elif '.m3u8' in variant_url: + return self._extract_m3u8_formats( + variant_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False) + else: + tbr = int_or_none(dict_get(variant, ('bitrate', 'bit_rate')), 1000) or None + f = { + 'url': variant_url, + 'format_id': 'http' + ('-%d' % tbr if tbr else ''), + 'tbr': tbr, + } + self._search_dimensions_in_video_url(f, variant_url) + return [f] + def _extract_formats_from_vmap_url(self, vmap_url, video_id): vmap_data = self._download_xml(vmap_url, video_id) - video_url = xpath_text(vmap_data, './/MediaFile').strip() - if determine_ext(video_url) == 'm3u8': - return self._extract_m3u8_formats( - video_url, video_id, ext='mp4', m3u8_id='hls', - entry_protocol='m3u8_native') - return [{ - 
'url': video_url, - }] + formats = [] + urls = [] + for video_variant in vmap_data.findall('.//{http://twitter.com/schema/videoVMapV2.xsd}videoVariant'): + video_variant.attrib['url'] = compat_urllib_parse_unquote( + video_variant.attrib['url']) + urls.append(video_variant.attrib['url']) + formats.extend(self._extract_variant_formats( + video_variant.attrib, video_id)) + video_url = strip_or_none(xpath_text(vmap_data, './/MediaFile')) + if video_url not in urls: + formats.extend(self._extract_variant_formats({'url': video_url}, video_id)) + return formats @staticmethod def _search_dimensions_in_video_url(a_format, video_url): @@ -40,10 +75,30 @@ class TwitterBaseIE(InfoExtractor): 'height': int(m.group('height')), }) - -class TwitterCardIE(TwitterBaseIE): + def _call_api(self, path, video_id, query={}): + headers = { + 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw', + } + if not self._GUEST_TOKEN: + self._GUEST_TOKEN = self._download_json( + self._API_BASE + 'guest/activate.json', video_id, + 'Downloading guest token', data=b'', + headers=headers)['guest_token'] + headers['x-guest-token'] = self._GUEST_TOKEN + try: + return self._download_json( + self._API_BASE + path, video_id, headers=headers, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + raise ExtractorError(self._parse_json( + e.cause.read().decode(), + video_id)['errors'][0]['message'], expected=True) + raise + + +class TwitterCardIE(InfoExtractor): IE_NAME = 'twitter:card' - _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?P<path>cards/tfw/v1|videos(?:/tweet)?)/(?P<id>\d+)' + _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/(?:cards/tfw/v1|videos(?:/tweet)?)/(?P<id>\d+)' _TESTS = [ { 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', @@ -51,19 +106,28 @@ class TwitterCardIE(TwitterBaseIE): 'info_dict': { 'id': '560070183650213889', 'ext': 
'mp4', - 'title': 'Twitter web player', - 'thumbnail': r're:^https?://.*\.jpg$', + 'title': "Twitter - You can now shoot, edit and share video on Twitter. Capture life's most moving moments from your perspective.", + 'description': 'md5:18d3e24bb4f6e5007487dd546e53bd96', + 'uploader': 'Twitter', + 'uploader_id': 'Twitter', + 'thumbnail': r're:^https?://.*\.jpg', 'duration': 30.033, + 'timestamp': 1422366112, + 'upload_date': '20150127', }, }, { 'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768', - 'md5': '7ee2a553b63d1bccba97fbed97d9e1c8', + 'md5': '7137eca597f72b9abbe61e5ae0161399', 'info_dict': { 'id': '623160978427936768', 'ext': 'mp4', - 'title': 'Twitter web player', - 'thumbnail': r're:^https?://.*$', + 'title': "NASA - Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASANewHorizons #PlutoFlyby video.", + 'description': "Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASANewHorizons #PlutoFlyby video. https://t.co/BJYgOjSeGA", + 'uploader': 'NASA', + 'uploader_id': 'NASA', + 'timestamp': 1437408129, + 'upload_date': '20150720', }, }, { @@ -75,7 +139,7 @@ class TwitterCardIE(TwitterBaseIE): 'title': 'Ubuntu 11.10 Overview', 'description': 'md5:a831e97fa384863d6e26ce48d1c43376', 'upload_date': '20111013', - 'uploader': 'OMG! Ubuntu!', + 'uploader': 'OMG! UBUNTU!', 'uploader_id': 'omgubuntu', }, 'add_ie': ['Youtube'], @@ -99,190 +163,30 @@ class TwitterCardIE(TwitterBaseIE): 'info_dict': { 'id': '705235433198714880', 'ext': 'mp4', - 'title': 'Twitter web player', - 'thumbnail': r're:^https?://.*', + 'title': "Brent Yarina - Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight.", + 'description': "Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight. 
https://t.co/OrxcJ28Bns", + 'uploader': 'Brent Yarina', + 'uploader_id': 'BTNBrentYarina', + 'timestamp': 1456976204, + 'upload_date': '20160303', }, + 'skip': 'This content is no longer available.', }, { 'url': 'https://twitter.com/i/videos/752274308186120192', 'only_matching': True, }, ] - _API_BASE = 'https://api.twitter.com/1.1' - - def _parse_media_info(self, media_info, video_id): - formats = [] - for media_variant in media_info.get('variants', []): - media_url = media_variant['url'] - if media_url.endswith('.m3u8'): - formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls')) - elif media_url.endswith('.mpd'): - formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash')) - else: - tbr = int_or_none(dict_get(media_variant, ('bitRate', 'bitrate')), scale=1000) - a_format = { - 'url': media_url, - 'format_id': 'http-%d' % tbr if tbr else 'http', - 'tbr': tbr, - } - # Reported bitRate may be zero - if not a_format['tbr']: - del a_format['tbr'] - - self._search_dimensions_in_video_url(a_format, media_url) - - formats.append(a_format) - return formats - - def _extract_mobile_formats(self, username, video_id): - webpage = self._download_webpage( - 'https://mobile.twitter.com/%s/status/%s' % (username, video_id), - video_id, 'Downloading mobile webpage', - headers={ - # A recent mobile UA is necessary for `gt` cookie - 'User-Agent': 'Mozilla/5.0 (Android 6.0.1; Mobile; rv:54.0) Gecko/54.0 Firefox/54.0', - }) - main_script_url = self._html_search_regex( - r'<script[^>]+src="([^"]+main\.[^"]+)"', webpage, 'main script URL') - main_script = self._download_webpage( - main_script_url, video_id, 'Downloading main script') - bearer_token = self._search_regex( - r'BEARER_TOKEN\s*:\s*"([^"]+)"', - main_script, 'bearer token') - # https://developer.twitter.com/en/docs/tweets/post-and-engage/api-reference/get-statuses-show-id - api_data = self._download_json( - '%s/statuses/show/%s.json' % (self._API_BASE, video_id), - video_id, 
'Downloading API data', - headers={ - 'Authorization': 'Bearer ' + bearer_token, - }) - media_info = try_get(api_data, lambda o: o['extended_entities']['media'][0]['video_info']) or {} - return self._parse_media_info(media_info, video_id) - def _real_extract(self, url): - path, video_id = re.search(self._VALID_URL, url).groups() - - config = None - formats = [] - duration = None - - urls = [url] - if path.startswith('cards/'): - urls.append('https://twitter.com/i/videos/' + video_id) - - for u in urls: - webpage = self._download_webpage( - u, video_id, headers={'Referer': 'https://twitter.com/'}) - - iframe_url = self._html_search_regex( - r'<iframe[^>]+src="((?:https?:)?//(?:www\.youtube\.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', - webpage, 'video iframe', default=None) - if iframe_url: - return self.url_result(iframe_url) - - config = self._parse_json(self._html_search_regex( - r'data-(?:player-)?config="([^"]+)"', webpage, - 'data player config', default='{}'), - video_id) - - if config.get('source_type') == 'vine': - return self.url_result(config['player_url'], 'Vine') - - periscope_url = PeriscopeIE._extract_url(webpage) - if periscope_url: - return self.url_result(periscope_url, PeriscopeIE.ie_key()) - - video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source') + status_id = self._match_id(url) + return self.url_result( + 'https://twitter.com/statuses/' + status_id, + TwitterIE.ie_key(), status_id) - if video_url: - if determine_ext(video_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls')) - else: - f = { - 'url': video_url, - } - - self._search_dimensions_in_video_url(f, video_url) - - formats.append(f) - - vmap_url = config.get('vmapUrl') or config.get('vmap_url') - if vmap_url: - formats.extend( - self._extract_formats_from_vmap_url(vmap_url, video_id)) - - media_info = None - for entity in config.get('status', {}).get('entities', []): - if 'mediaInfo' in entity: - 
media_info = entity['mediaInfo'] - - if media_info: - formats.extend(self._parse_media_info(media_info, video_id)) - duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9) - - username = config.get('user', {}).get('screen_name') - if username: - formats.extend(self._extract_mobile_formats(username, video_id)) - - if formats: - title = self._search_regex(r'<title>([^<]+)</title>', webpage, 'title') - thumbnail = config.get('posterImageUrl') or config.get('image_src') - duration = float_or_none(config.get('duration'), scale=1000) or duration - break - - if not formats: - headers = { - 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw', - 'Referer': url, - } - ct0 = self._get_cookies(url).get('ct0') - if ct0: - headers['csrf_token'] = ct0.value - guest_token = self._download_json( - '%s/guest/activate.json' % self._API_BASE, video_id, - 'Downloading guest token', data=b'', - headers=headers)['guest_token'] - headers['x-guest-token'] = guest_token - self._set_cookie('api.twitter.com', 'gt', guest_token) - config = self._download_json( - '%s/videos/tweet/config/%s.json' % (self._API_BASE, video_id), - video_id, headers=headers) - track = config['track'] - vmap_url = track.get('vmapUrl') - if vmap_url: - formats = self._extract_formats_from_vmap_url(vmap_url, video_id) - else: - playback_url = track['playbackUrl'] - if determine_ext(playback_url) == 'm3u8': - formats = self._extract_m3u8_formats( - playback_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls') - else: - formats = [{ - 'url': playback_url, - }] - title = 'Twitter web player' - thumbnail = config.get('posterImage') - duration = float_or_none(track.get('durationMs'), scale=1000) - - self._remove_duplicate_formats(formats) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - } - - 
-class TwitterIE(InfoExtractor): +class TwitterIE(TwitterBaseIE): IE_NAME = 'twitter' - _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?:i/web|(?P<user_id>[^/]+))/status/(?P<id>\d+)' - _TEMPLATE_URL = 'https://twitter.com/%s/status/%s' - _TEMPLATE_STATUSES_URL = 'https://twitter.com/statuses/%s' + _VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P<id>\d+)' _TESTS = [{ 'url': 'https://twitter.com/freethenipple/status/643211948184596480', @@ -291,10 +195,13 @@ class TwitterIE(InfoExtractor): 'ext': 'mp4', 'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!', 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"', + 'description': 'FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ', 'uploader': 'FREE THE NIPPLE', 'uploader_id': 'freethenipple', 'duration': 12.922, + 'timestamp': 1442188653, + 'upload_date': '20150913', + 'age_limit': 18, }, }, { 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1', @@ -316,19 +223,23 @@ class TwitterIE(InfoExtractor): 'id': '665052190608723968', 'ext': 'mp4', 'title': 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.', - 'description': 'Star Wars on Twitter: "A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens."', + 'description': 'A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens. https://t.co/OkSqT2fjWJ', 'uploader_id': 'starwars', 'uploader': 'Star Wars', + 'timestamp': 1447395772, + 'upload_date': '20151113', }, }, { 'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880', 'info_dict': { 'id': '705235433198714880', 'ext': 'mp4', - 'title': 'Brent Yarina - Khalil Iverson\'s missed highlight dunk. And made highlight dunk. 
In one highlight.', - 'description': 'Brent Yarina on Twitter: "Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight."', + 'title': "Brent Yarina - Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight.", + 'description': "Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight. https://t.co/OrxcJ28Bns", 'uploader_id': 'BTNBrentYarina', 'uploader': 'Brent Yarina', + 'timestamp': 1456976204, + 'upload_date': '20160303', }, 'params': { # The same video as https://twitter.com/i/videos/tweet/705235433198714880 @@ -340,12 +251,14 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': '700207533655363584', 'ext': 'mp4', - 'title': 'JG - BEAT PROD: @suhmeduh #Damndaniel', - 'description': 'JG on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"', + 'title': 'Simon Vertugo - BEAT PROD: @suhmeduh #Damndaniel', + 'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ', 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'JG', - 'uploader_id': 'jaydingeer', + 'uploader': 'Simon Vertugo', + 'uploader_id': 'simonvertugo', 'duration': 30.0, + 'timestamp': 1455777459, + 'upload_date': '20160218', }, }, { 'url': 'https://twitter.com/Filmdrunk/status/713801302971588609', @@ -353,10 +266,9 @@ class TwitterIE(InfoExtractor): 'info_dict': { 'id': 'MIOxnrUteUd', 'ext': 'mp4', - 'title': 'Vince Mancini - Vine of the day', - 'description': 'Vince Mancini on Twitter: "Vine of the day https://t.co/xmTvRdqxWf"', - 'uploader': 'Vince Mancini', - 'uploader_id': 'Filmdrunk', + 'title': 'Dr.Pepperの飲み方 #japanese #バカ #ドクペ #電動ガン', + 'uploader': 'TAKUMA', + 'uploader_id': '1004126642786242560', 'timestamp': 1402826626, 'upload_date': '20140615', }, @@ -367,21 +279,22 @@ class TwitterIE(InfoExtractor): 'id': '719944021058060289', 'ext': 'mp4', 'title': 'Captain America - @King0fNerd Are you sure you made the right choice? 
Find out in theaters.', - 'description': 'Captain America on Twitter: "@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI"', - 'uploader_id': 'captainamerica', + 'description': '@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI', + 'uploader_id': 'CaptainAmerica', 'uploader': 'Captain America', 'duration': 3.17, + 'timestamp': 1460483005, + 'upload_date': '20160412', }, }, { 'url': 'https://twitter.com/OPP_HSD/status/779210622571536384', 'info_dict': { 'id': '1zqKVVlkqLaKB', 'ext': 'mp4', - 'title': 'Sgt Kerry Schmidt - LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence', - 'description': 'Sgt Kerry Schmidt on Twitter: "LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence https://t.co/EKrVgIXF3s"', + 'title': 'Sgt Kerry Schmidt - Ontario Provincial Police - Road rage, mischief, assault, rollover and fire in one occurrence', 'upload_date': '20160923', - 'uploader_id': 'OPP_HSD', - 'uploader': 'Sgt Kerry Schmidt', + 'uploader_id': '1PmKqpJdOJQoY', + 'uploader': 'Sgt Kerry Schmidt - Ontario Provincial Police', 'timestamp': 1474613214, }, 'add_ie': ['Periscope'], @@ -392,10 +305,12 @@ class TwitterIE(InfoExtractor): 'id': '852138619213144067', 'ext': 'mp4', 'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة', - 'description': 'عالم الأخبار on Twitter: "كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN"', + 'description': 'كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. 
الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN', 'uploader': 'عالم الأخبار', 'uploader_id': 'news_al3alm', 'duration': 277.4, + 'timestamp': 1492000653, + 'upload_date': '20170412', }, }, { 'url': 'https://twitter.com/i/web/status/910031516746514432', @@ -404,10 +319,12 @@ class TwitterIE(InfoExtractor): 'ext': 'mp4', 'title': 'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre.', 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'Préfet de Guadeloupe on Twitter: "[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. https://t.co/mwx01Rs4lo"', + 'description': '[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. https://t.co/mwx01Rs4lo', 'uploader': 'Préfet de Guadeloupe', 'uploader_id': 'Prefet971', 'duration': 47.48, + 'timestamp': 1505803395, + 'upload_date': '20170919', }, 'params': { 'skip_download': True, # requires ffmpeg @@ -420,10 +337,12 @@ class TwitterIE(InfoExtractor): 'ext': 'mp4', 'title': 're:.*?Shep is on a roll today.*?', 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'md5:63b036c228772523ae1924d5f8e5ed6b', + 'description': 'md5:37b9f2ff31720cef23b2bd42ee8a0f09', 'uploader': 'Lis Power', 'uploader_id': 'LisPower1', 'duration': 111.278, + 'timestamp': 1527623489, + 'upload_date': '20180529', }, 'params': { 'skip_download': True, # requires ffmpeg @@ -435,88 +354,163 @@ class TwitterIE(InfoExtractor): 'ext': 'mp4', 'title': 'Twitter - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. 
Let us know your thoughts!', 'thumbnail': r're:^https?://.*\.jpg', - 'description': 'md5:66d493500c013e3e2d434195746a7f78', + 'description': 'md5:6dfd341a3310fb97d80d2bf7145df976', 'uploader': 'Twitter', 'uploader_id': 'Twitter', 'duration': 61.567, + 'timestamp': 1548184644, + 'upload_date': '20190122', + }, + }, { + # not available in Periscope + 'url': 'https://twitter.com/ViviEducation/status/1136534865145286656', + 'info_dict': { + 'id': '1vOGwqejwoWxB', + 'ext': 'mp4', + 'title': 'Vivi - Vivi founder @lior_rauchy announcing our new student feedback tool live at @EduTECH_AU #EduTECH2019', + 'uploader': 'Vivi', + 'uploader_id': '1eVjYOLGkGrQL', }, + 'add_ie': ['TwitterBroadcast'], + }, { + # Twitch Clip Embed + 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - twid = mobj.group('id') - - webpage, urlh = self._download_webpage_handle( - self._TEMPLATE_STATUSES_URL % twid, twid) - - if 'twitter.com/account/suspended' in urlh.geturl(): - raise ExtractorError('Account suspended by Twitter.', expected=True) - - user_id = None - - redirect_mobj = re.match(self._VALID_URL, urlh.geturl()) - if redirect_mobj: - user_id = redirect_mobj.group('user_id') - - if not user_id: - user_id = mobj.group('user_id') - - username = remove_end(self._og_search_title(webpage), ' on Twitter') - - title = description = self._og_search_description(webpage).strip('').replace('\n', ' ').strip('“”') + twid = self._match_id(url) + status = self._call_api( + 'statuses/show/%s.json' % twid, twid, { + 'cards_platform': 'Web-12', + 'include_cards': 1, + 'include_reply_count': 1, + 'include_user_entities': 0, + 'tweet_mode': 'extended', + }) + title = description = status['full_text'].replace('\n', ' ') # strip 'https -_t.co_BJYgOjSeGA' junk from filenames title = re.sub(r'\s+(https?://[^ ]+)', '', title) + user = status.get('user') or {} + uploader = user.get('name') + if uploader: + 
title = '%s - %s' % (uploader, title) + uploader_id = user.get('screen_name') + + tags = [] + for hashtag in (try_get(status, lambda x: x['entities']['hashtags'], list) or []): + hashtag_text = hashtag.get('text') + if not hashtag_text: + continue + tags.append(hashtag_text) info = { - 'uploader_id': user_id, - 'uploader': username, - 'webpage_url': url, - 'description': '%s on Twitter: "%s"' % (username, description), - 'title': username + ' - ' + title, + 'id': twid, + 'title': title, + 'description': description, + 'uploader': uploader, + 'timestamp': unified_timestamp(status.get('created_at')), + 'uploader_id': uploader_id, + 'uploader_url': 'https://twitter.com/' + uploader_id if uploader_id else None, + 'like_count': int_or_none(status.get('favorite_count')), + 'repost_count': int_or_none(status.get('retweet_count')), + 'comment_count': int_or_none(status.get('reply_count')), + 'age_limit': 18 if status.get('possibly_sensitive') else 0, + 'tags': tags, } - mobj = re.search(r'''(?x) - <video[^>]+class="animated-gif"(?P<more_info>[^>]+)>\s* - <source[^>]+video-src="(?P<url>[^"]+)" - ''', webpage) - - if mobj: - more_info = mobj.group('more_info') - height = int_or_none(self._search_regex( - r'data-height="(\d+)"', more_info, 'height', fatal=False)) - width = int_or_none(self._search_regex( - r'data-width="(\d+)"', more_info, 'width', fatal=False)) - thumbnail = self._search_regex( - r'poster="([^"]+)"', more_info, 'poster', fatal=False) - info.update({ - 'id': twid, - 'url': mobj.group('url'), - 'height': height, - 'width': width, - 'thumbnail': thumbnail, - }) - return info + media = try_get(status, lambda x: x['extended_entities']['media'][0]) + if media and media.get('type') != 'photo': + video_info = media.get('video_info') or {} + + formats = [] + for variant in video_info.get('variants', []): + formats.extend(self._extract_variant_formats(variant, twid)) + self._sort_formats(formats) + + thumbnails = [] + media_url = media.get('media_url_https') or 
media.get('media_url') + if media_url: + def add_thumbnail(name, size): + thumbnails.append({ + 'id': name, + 'url': update_url_query(media_url, {'name': name}), + 'width': int_or_none(size.get('w') or size.get('width')), + 'height': int_or_none(size.get('h') or size.get('height')), + }) + for name, size in media.get('sizes', {}).items(): + add_thumbnail(name, size) + add_thumbnail('orig', media.get('original_info') or {}) - twitter_card_url = None - if 'class="PlayableMedia' in webpage: - twitter_card_url = '%s//twitter.com/i/videos/tweet/%s' % (self.http_scheme(), twid) - else: - twitter_card_iframe_url = self._search_regex( - r'data-full-card-iframe-url=([\'"])(?P<url>(?:(?!\1).)+)\1', - webpage, 'Twitter card iframe URL', default=None, group='url') - if twitter_card_iframe_url: - twitter_card_url = compat_urlparse.urljoin(url, twitter_card_iframe_url) - - if twitter_card_url: info.update({ - '_type': 'url_transparent', - 'ie_key': 'TwitterCard', - 'url': twitter_card_url, + 'formats': formats, + 'thumbnails': thumbnails, + 'duration': float_or_none(video_info.get('duration_millis'), 1000), }) - return info - - raise ExtractorError('There\'s no video in this tweet.') + else: + card = status.get('card') + if card: + binding_values = card['binding_values'] + + def get_binding_value(k): + o = binding_values.get(k) or {} + return try_get(o, lambda x: x[x['type'].lower() + '_value']) + + card_name = card['name'].split(':')[-1] + if card_name == 'amplify': + formats = self._extract_formats_from_vmap_url( + get_binding_value('amplify_url_vmap'), + get_binding_value('amplify_content_id') or twid) + self._sort_formats(formats) + + thumbnails = [] + for suffix in ('_small', '', '_large', '_x_large', '_original'): + image = get_binding_value('player_image' + suffix) or {} + image_url = image.get('url') + if not image_url or '/player-placeholder' in image_url: + continue + thumbnails.append({ + 'id': suffix[1:] if suffix else 'medium', + 'url': image_url, + 'width': 
int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + }) + + info.update({ + 'formats': formats, + 'thumbnails': thumbnails, + 'duration': int_or_none(get_binding_value( + 'content_duration_seconds')), + }) + elif card_name == 'player': + info.update({ + '_type': 'url', + 'url': get_binding_value('player_url'), + }) + elif card_name == 'periscope_broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('url') or get_binding_value('player_url'), + 'ie_key': PeriscopeIE.ie_key(), + }) + elif card_name == 'broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('broadcast_url'), + 'ie_key': TwitterBroadcastIE.ie_key(), + }) + else: + raise ExtractorError('Unsupported Twitter Card.') + else: + expanded_url = try_get(status, lambda x: x['entities']['urls'][0]['expanded_url']) + if not expanded_url: + raise ExtractorError("There's no video in this tweet.") + info.update({ + '_type': 'url', + 'url': expanded_url, + }) + return info class TwitterAmplifyIE(TwitterBaseIE): @@ -573,3 +567,27 @@ class TwitterAmplifyIE(TwitterBaseIE): 'formats': formats, 'thumbnails': thumbnails, } + + +class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): + IE_NAME = 'twitter:broadcast' + _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/broadcasts/(?P<id>[0-9a-zA-Z]{13})' + + def _real_extract(self, url): + broadcast_id = self._match_id(url) + broadcast = self._call_api( + 'broadcasts/show.json', broadcast_id, + {'ids': broadcast_id})['broadcasts'][broadcast_id] + info = self._parse_broadcast_data(broadcast, broadcast_id) + media_key = broadcast['media_key'] + source = self._call_api( + 'live_video_stream/status/' + media_key, media_key)['source'] + m3u8_url = source.get('noRedirectPlaybackUrl') or source['location'] + if '/live_video_stream/geoblocked/' in m3u8_url: + self.raise_geo_restricted() + m3u8_id = compat_parse_qs(compat_urllib_parse_urlparse( + m3u8_url).query).get('type', [None])[0] + state, width, height = 
self._extract_common_format_info(broadcast) + info['formats'] = self._extract_pscp_m3u8_formats( + m3u8_url, broadcast_id, m3u8_id, state, width, height) + return info