commit c701472fc99c90f6ef4fdf0f0169a342f2e9e892
parent 19591facea9abe965a734224bea33948bb57b5f4
Author: Sergey M․ <dstftw@gmail.com>
Date:   Sun,  7 Apr 2019 02:15:25 +0700

[platzi] Add extractor (closes #20562)

Diffstat:
Myoutube_dl/extractor/extractors.py | 4++++
Ayoutube_dl/extractor/platzi.py | 217+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 221 insertions(+), 0 deletions(-)

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py @@ -868,6 +868,10 @@ from .picarto import ( from .piksel import PikselIE from .pinkbike import PinkbikeIE from .pladform import PladformIE +from .platzi import ( + PlatziIE, + PlatziCourseIE, +) from .playfm import PlayFMIE from .playplustv import PlayPlusTVIE from .plays import PlaysTVIE diff --git a/youtube_dl/extractor/platzi.py b/youtube_dl/extractor/platzi.py @@ -0,0 +1,217 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_b64decode, + compat_str, +) +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + str_or_none, + try_get, + url_or_none, + urlencode_postdata, + urljoin, +) + + +class PlatziIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + platzi\.com/clases| # es version + courses\.platzi\.com/classes # en version + )/[^/]+/(?P<id>\d+)-[^/?\#&]+ + ''' + _LOGIN_URL = 'https://platzi.com/login/' + _NETRC_MACHINE = 'platzi' + + _TESTS = [{ + 'url': 'https://platzi.com/clases/1311-next-js/12074-creando-nuestra-primera-pagina/', + 'md5': '8f56448241005b561c10f11a595b37e3', + 'info_dict': { + 'id': '12074', + 'ext': 'mp4', + 'title': 'Creando nuestra primera página', + 'description': 'md5:4c866e45034fc76412fbf6e60ae008bc', + 'duration': 420, + }, + 'skip': 'Requires platzi account credentials', + }, { + 'url': 'https://courses.platzi.com/classes/1367-communication-codestream/13430-background/', + 'info_dict': { + 'id': '13430', + 'ext': 'mp4', + 'title': 'Background', + 'description': 'md5:49c83c09404b15e6e71defaf87f6b305', + 'duration': 360, + }, + 'skip': 'Requires platzi account credentials', + 'params': { + 'skip_download': True, + }, + }] + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'email': username, + 'password': password, + }) + + urlh = self._request_webpage( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={'Referer': self._LOGIN_URL}) + + # login succeeded + if 'platzi.com/login' not in compat_str(urlh.geturl()): + return + + login_error = self._webpage_read_content( + urlh, self._LOGIN_URL, None, 'Downloading login error page') + + login = self._parse_json( + self._search_regex( + r'login\s*=\s*({.+?})(?:\s*;|\s*</script)', login_error, 'login'), + None) + + for kind in ('error', 'password', 'nonFields'): + error = str_or_none(login.get('%sError' % kind)) + if error: + raise ExtractorError( + 'Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + def _real_extract(self, url): + lecture_id = self._match_id(url) + + webpage = self._download_webpage(url, lecture_id) + + data = self._parse_json( + self._search_regex( + r'client_data\s*=\s*({.+?})\s*;', webpage, 'client data'), + lecture_id) + + material = data['initialState']['material'] + desc = material['description'] + title = desc['title'] + + formats = [] + for server_id, server in material['videos'].items(): + if not isinstance(server, dict): + continue + for format_id in ('hls', 'dash'): + format_url = url_or_none(server.get(format_id)) + if not format_url: + continue + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, lecture_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id=format_id, + note='Downloading %s m3u8 information' % server_id, + fatal=False)) + elif format_id == 'dash': + formats.extend(self._extract_mpd_formats( + format_url, lecture_id, mpd_id=format_id, + note='Downloading %s MPD manifest' % server_id, + fatal=False)) + self._sort_formats(formats) + + content = str_or_none(desc.get('content')) + description = (clean_html(compat_b64decode(content).decode('utf-8')) + if content else None) + duration = int_or_none(material.get('duration'), invscale=60) + + return { + 'id': lecture_id, + 'title': title, + 'description': description, + 'duration': duration, + 'formats': formats, + } + + +class PlatziCourseIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + platzi\.com/clases| # es version + courses\.platzi\.com/classes # en version + )/(?P<id>[^/?\#&]+) + ''' + _TESTS = [{ + 'url': 'https://platzi.com/clases/next-js/', + 'info_dict': { + 'id': '1311', + 'title': 'Curso de Next.js', + }, + 'playlist_count': 22, + }, { + 'url': 'https://courses.platzi.com/classes/communication-codestream/', + 'info_dict': { + 'id': '1367', + 'title': 'Codestream Course', + }, + 'playlist_count': 14, + }] + + @classmethod + def suitable(cls, url): + return False if PlatziIE.suitable(url) else super(PlatziCourseIE, cls).suitable(url) + + def _real_extract(self, url): + course_name = self._match_id(url) + + webpage = self._download_webpage(url, course_name) + + props = self._parse_json( + self._search_regex(r'data\s*=\s*({.+?})\s*;', webpage, 'data'), + course_name)['initialProps'] + + entries = [] + for chapter_num, chapter in enumerate(props['concepts'], 1): + if not isinstance(chapter, dict): + continue + materials = chapter.get('materials') + if not materials or not isinstance(materials, list): + continue + chapter_title = chapter.get('title') + chapter_id = str_or_none(chapter.get('id')) + for material in materials: + if not isinstance(material, dict): + continue + if material.get('material_type') != 'video': + continue + video_url = urljoin(url, material.get('url')) + if not video_url: + continue + entries.append({ + '_type': 'url_transparent', + 'url': video_url, + 'title': str_or_none(material.get('name')), + 'id': str_or_none(material.get('id')), + 'ie_key': PlatziIE.ie_key(), + 'chapter': chapter_title, + 'chapter_number': chapter_num, + 'chapter_id': chapter_id, + }) + + course_id = compat_str(try_get(props, lambda x: x['course']['id'])) + course_title = try_get(props, lambda x: x['course']['name'], compat_str) + + return self.playlist_result(entries, course_id, course_title)