commit dfe0a3a9d2e07ac1e5ad221912d03b999ebb4d75
parent c976873c5b2912c06ff53e5193640ee8627edee4
Author: Sergey M․ <dstftw@gmail.com>
Date:   Fri,  7 Dec 2018 03:27:11 +0700

[lecturio] Add extractor (closes #18405)

Diffstat:
M youtube_dl/extractor/extractors.py | 4 ++++
A youtube_dl/extractor/lecturio.py | 186 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 190 insertions(+), 0 deletions(-)

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py @@ -553,6 +553,10 @@ from .lcp import ( ) from .learnr import LearnrIE from .lecture2go import Lecture2GoIE +from .lecturio import ( + LecturioIE, + LecturioCourseIE, +) from .leeco import ( LeIE, LePlaylistIE, diff --git a/youtube_dl/extractor/lecturio.py b/youtube_dl/extractor/lecturio.py @@ -0,0 +1,186 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + extract_attributes, + ExtractorError, + float_or_none, + int_or_none, + str_or_none, + url_or_none, + urlencode_postdata, + urljoin, +) + + +class LecturioBaseIE(InfoExtractor): + _LOGIN_URL = 'https://app.lecturio.com/en/login' + _NETRC_MACHINE = 'lecturio' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + # Sets some cookies + _, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Downloading login popup') + + def is_logged(url_handle): + return self._LOGIN_URL not in compat_str(url_handle.geturl()) + + # Already logged in + if is_logged(urlh): + return + + login_form = { + 'signin[email]': username, + 'signin[password]': password, + 'signin[remember]': 'on', + } + + response, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(login_form)) + + # Logged in successfully + if is_logged(urlh): + return + + errors = self._html_search_regex( + r'(?s)<ul[^>]+class=["\']error_list[^>]+>(.+?)</ul>', response, + 'errors', default=None) + if errors: + raise ExtractorError('Unable to login: %s' % errors, expected=True) + raise ExtractorError('Unable to log in') + + +class LecturioIE(LecturioBaseIE): + _VALID_URL = r'https://app\.lecturio\.com/[^/]+/(?P<id>[^/?#&]+)\.lecture' + _TEST = { + 'url': 
'https://app.lecturio.com/medical-courses/important-concepts-and-terms-introduction-to-microbiology.lecture#tab/videos', + 'md5': 'f576a797a5b7a5e4e4bbdfc25a6a6870', + 'info_dict': { + 'id': '39634', + 'ext': 'mp4', + 'title': 'Important Concepts and Terms – Introduction to Microbiology', + }, + 'skip': 'Requires lecturio account credentials', + } + + _CC_LANGS = { + 'German': 'de', + 'English': 'en', + 'Spanish': 'es', + 'French': 'fr', + 'Polish': 'pl', + 'Russian': 'ru', + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://app.lecturio.com/en/lecture/%s/player.html' % display_id, + display_id) + + lecture_id = self._search_regex( + r'lecture_id\s*=\s*(?:L_)?(\d+)', webpage, 'lecture id') + + api_url = self._search_regex( + r'lectureDataLink\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'api url', group='url') + + video = self._download_json(api_url, display_id) + + title = video['title'].strip() + + formats = [] + for format_ in video['content']['media']: + if not isinstance(format_, dict): + continue + file_ = format_.get('file') + if not file_: + continue + ext = determine_ext(file_) + if ext == 'smil': + # smil contains only broken RTMP formats anyway + continue + file_url = url_or_none(file_) + if not file_url: + continue + label = str_or_none(format_.get('label')) + filesize = int_or_none(format_.get('fileSize')) + formats.append({ + 'url': file_url, + 'format_id': label, + 'filesize': float_or_none(filesize, invscale=1000) + }) + self._sort_formats(formats) + + subtitles = {} + automatic_captions = {} + cc = self._parse_json( + self._search_regex( + r'subtitleUrls\s*:\s*({.+?})\s*,', webpage, 'subtitles', + default='{}'), display_id, fatal=False) + for cc_label, cc_url in cc.items(): + cc_url = url_or_none(cc_url) + if not cc_url: + continue + sub_dict = automatic_captions if 'auto-translated' in cc_label else subtitles + lang = self._search_regex( + r'/([a-z]{2})_', cc_url, 'lang', 
default=cc_label.split()[0]) + sub_dict.setdefault(self._CC_LANGS.get(lang, lang), []).append({ + 'url': cc_url, + }) + + return { + 'id': lecture_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'automatic_captions': automatic_captions, + } + + +class LecturioCourseIE(LecturioBaseIE): + _VALID_URL = r'https://app\.lecturio\.com/[^/]+/(?P<id>[^/?#&]+)\.course' + _TEST = { + 'url': 'https://app.lecturio.com/medical-courses/microbiology-introduction.course#/', + 'info_dict': { + 'id': 'microbiology-introduction', + 'title': 'Microbiology: Introduction', + }, + 'playlist_count': 45, + 'skip': 'Requires lecturio account credentials', + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + entries = [] + for mobj in re.finditer( + r'(?s)<[^>]+\bdata-url=(["\'])(?:(?!\1).)+\.lecture\b[^>]+>', + webpage): + params = extract_attributes(mobj.group(0)) + lecture_url = urljoin(url, params.get('data-url')) + lecture_id = params.get('data-id') + entries.append(self.url_result( + lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id)) + + title = self._search_regex( + r'<span[^>]+class=["\']content-title[^>]+>([^<]+)', webpage, + 'title', default=None) + + return self.playlist_result(entries, display_id, title)