Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[bbc] Fix BBCCoUkIPlayerPlaylistIE #28360

Closed
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 42 additions & 7 deletions youtube_dl/extractor/bbc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1318,7 +1318,7 @@ def _entries(self, webpage, url, playlist_id):
if single_page:
return
next_page = self._search_regex(
r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2',
r'(?:<li[^>]+class=(["\'])pagination_+next\1[^>]*>\s*<a|<a[^>]+\baria-label=(["\'])Next Page\2)[^>]+href=(["\'])(?P<url>(?:(?!\3).)+)\3',
webpage, 'next page url', default=None, group='url')
if not next_page:
break
Expand All @@ -1328,6 +1328,7 @@ def _entries(self, webpage, url, playlist_id):

def _real_extract(self, url):
playlist_id = self._match_id(url)
self._playlist_id = playlist_id

webpage = self._download_webpage(url, playlist_id)

Expand All @@ -1342,7 +1343,10 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
IE_NAME = 'bbc.co.uk:iplayer:playlist'
_VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX
_URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s'
_VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)'
_VIDEO_ID_TEMPLATE = r'"href":\s*"/iplayer/episode/(%s)/'
_SERIES_ID_TEMPLATE = '/iplayer/episodes/%s/.+[?&]seriesId=(%s)'
_SERIES_URL_TEMPLATE = 'http://www.bbc.co.uk/programmes/%s/episodes/player'

_TESTS = [{
'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v',
'info_dict': {
Expand All @@ -1358,16 +1362,47 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE):
'info_dict': {
'id': 'p02tcc32',
'title': 'Bohemian Icons',
'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7',
'description': 'md5:8b60017680e9f3115e79e0c20697a585',
},
'playlist_mincount': 10,
}, {
# Playlist with more than one series/season
'url': 'https://www.bbc.co.uk/iplayer/episodes/b094m5t9/doctor-foster',
'info_dict': {
'id': 'b094m5t9',
'title': 'Doctor Foster',
'description': 'A trusted GP sees her charmed life explode when she suspects her husband of an affair.',
},
'playlist_mincount': 10,
}, {
# Playlist with more than one page
'url': 'https://www.bbc.co.uk/iplayer/episodes/m0004c4v/beechgrove',
'info_dict': {
'id': 'm0004c4v',
'title': 'Beechgrove',
'description': 'Gardening show that celebrates Scottish horticulture and growing conditions.',
},
'playlist_mincount': 37,
}]

def _entries(self, webpage, url, playlist_id):
    """Yield the playlist entries for an iPlayer episodes/group page.

    First re-yields everything produced by the parent class's
    ``_entries`` for the same arguments. Then, for every match of
    ``_SERIES_ID_TEMPLATE`` (filled in with ``playlist_id`` and
    ``BBCCoUkIE._ID_REGEX``) in ``webpage``, yields a URL result pointing
    at ``_SERIES_URL_TEMPLATE`` for the matched series id.

    NOTE(review): series ids are yielded in the order they are found,
    without de-duplication.
    """
    # Entries found by the parent implementation come first.
    parent_entries = super(BBCCoUkIPlayerPlaylistIE, self)._entries(
        webpage, url, playlist_id)
    for parent_entry in parent_entries:
        yield parent_entry

    # Additional series linked from the page are delegated to the
    # extractor handling the series URL.
    series_pattern = self._SERIES_ID_TEMPLATE % (
        playlist_id, BBCCoUkIE._ID_REGEX)
    found_series = re.findall(series_pattern, webpage)
    for found_id in found_series:
        series_url = self._SERIES_URL_TEMPLATE % found_id
        yield self.url_result(series_url)

def _extract_title_and_description(self, webpage):
    """Return ``(title, description)`` for an iPlayer playlist page.

    The values are taken from the JSON assigned to ``_REDUX_STATE__``
    inside the ``tvip-script-app-store`` script element: its ``header``
    and ``page`` objects are merged, with ``page`` taking precedence on
    shared keys. If that gives no title, the Open Graph title is used
    (non-fatally). If it gives no description, the ``description`` meta
    tag is tried, then the Open Graph description.

    ``self._playlist_id`` (set by ``_real_extract``) is passed to the
    JSON parser as the id to report against.
    """
    redux_json = self._html_search_regex(
        r'<script[^>]+id=(["\'])tvip-script-app-store\1[^>]*>[^<]*_REDUX_STATE__\s*=\s*(?P<json>[^<]+)\s*;\s*<',
        webpage, 'redux state', default='{}', group='json')
    redux_state = self._parse_json(
        redux_json, self._playlist_id, fatal=False)
    # A non-fatal parse failure (or unexpected JSON such as a list or
    # null) must not break the .get() lookups below, so anything that is
    # not a dict is treated as "no state".
    if not isinstance(redux_state, dict):
        redux_state = {}

    # Merge into a copy so the parsed 'header' object is left untouched.
    page_info = {}
    if redux_state:
        page_info.update(redux_state.get('header') or {})
        page_info.update(redux_state.get('page') or {})

    title = page_info.get('title') or self._og_search_title(
        webpage, fatal=False)
    description = (
        page_info.get('description')
        or self._html_search_meta('description', webpage, default=None)
        or self._og_search_description(webpage))
    return title, description


Expand Down