Skip to content

Commit

Permalink
[ie/mediasite] Provide untimed transcript in ttml format
Browse files Browse the repository at this point in the history
  • Loading branch information
kclauhk committed Dec 26, 2024
1 parent 363863e commit a9df4a4
Showing 1 changed file with 34 additions and 7 deletions.
41 changes: 34 additions & 7 deletions yt_dlp/extractor/mediasite.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,25 @@ def __extract_slides(self, *, stream_id, snum, stream, duration, images):
'fragment_base_url': slide_base_url,
}

def _get_transcript_txt(self, transcript_url, resource_id, lang_code='und', lang_name=None):
    """Build a subtitle entry for a plain-text (untimed) transcript.

    The transcript is exposed as a 'ttml' subtitle. When subtitles are being
    written and the requested subtitle format mentions 'ttml', the text is
    downloaded and wrapped in a minimal TTML document that is returned
    inline under 'data'. Otherwise only the transcript URL is returned.

    @param transcript_url  URL of the plain-text transcript
    @param resource_id     ID passed to the downloader for progress messages
    @param lang_code       value written to the xml:lang attribute of <tt>
    @param lang_name       optional language name, prefixed to '(Untimed)'
    @returns               dict with 'name', 'ext' and either 'data' or 'url';
                           an empty dict if the transcript download fails
    """
    from xml.sax.saxutils import escape

    entry = {
        'name': join_nonempty(lang_name, '(Untimed)', delim=' '),
        'ext': 'ttml',
    }

    wants_subs = self.get_param('writesubtitles') or self.get_param('writeautomaticsub')
    # 'subtitlesformat' can be unset (None); treat that as "ttml not requested"
    # instead of raising TypeError on the membership test.
    if not (wants_subs and 'ttml' in (self.get_param('subtitlesformat') or '')):
        return {'url': transcript_url, **entry}

    transcript = self._download_webpage(
        transcript_url, resource_id, note='Downloading transcript', fatal=False)
    if not transcript:
        # NOTE(review): callers insert the returned dict into the subtitles
        # list as-is; confirm an entry without 'url'/'data' is acceptable there.
        return {}

    # The root <tt> element must stay open until the closing </tt>, and the
    # free-form transcript text must be escaped so that '&' and '<' cannot
    # break the document.
    xml_lang = escape(str(lang_code), {'"': '&quot;'})
    data = (
        '<?xml version="1.0" encoding="utf-8" ?>\n'
        f'<tt xml:lang="{xml_lang}" xmlns="http://www.w3.org/ns/ttml">\n'
        '<head>\n</head>\n<body>\n<div>\n<p xml:id="transcript">\n'
        + escape(transcript.strip())
        + '\n</p>\n</div>\n</body>\n</tt>')
    return {'data': data, **entry}

def _real_extract(self, url):
url, data = unsmuggle_url(url, {})
mobj = self._match_valid_url(url)
Expand Down Expand Up @@ -370,14 +389,22 @@ def _real_extract(self, url):
else:
subtitles.setdefault(lang_code, []).append(t)
if transcript_url := presentation.get('TranscriptUrl'):
if 'playbackTicket' not in transcript_url:
transcript_url = join_nonempty(
transcript_url, traverse_obj(presentation, ('Streams', 0, 'SlidePlaybackTicketId', {str_or_none})),
delim='?playbackTicket=')
if determine_ext(transcript_url) != 'txt':
if len(transcripts) == 1:
(captions or subtitles).setdefault(lang_code, []).append({
'url': transcript_url,
'name': lang_name,
})
else:
subtitles.setdefault('und', []).append({'url': transcript_url})
ts = {'url': transcript_url}
else:
ts = self._get_transcript_txt(
transcript_url, resource_id, *([lang_code, lang_name] if len(transcripts) == 1 else ['und']))
if len(transcripts) == 1:
(captions or subtitles).setdefault(lang_code, []).insert(0, {
'name': lang_name,
**ts,
})
else:
subtitles.setdefault('und', []).insert(0, ts)

return {
'id': resource_id,
Expand Down

0 comments on commit a9df4a4

Please sign in to comment.