diff --git a/src/yt2doc/extraction/extractor.py b/src/yt2doc/extraction/extractor.py index e8af7bf..a6569b1 100644 --- a/src/yt2doc/extraction/extractor.py +++ b/src/yt2doc/extraction/extractor.py @@ -11,12 +11,12 @@ class Extractor: def __init__( self, - video_info_extractor: youtube_interfaces.IYtVideoInfoExtractor, + media_info_extractor: youtube_interfaces.IYtMediaInfoExtractor, transcriber: transcription_interfaces.ITranscriber, file_cache: interfaces.IFileCache, ignore_source_chapters: bool, ) -> None: - self.yt_dlp_adapter = video_info_extractor + self.yt_dlp_adapter = media_info_extractor self.transcriber = transcriber self.file_cache = file_cache self.ignore_source_chapters = ignore_source_chapters @@ -28,16 +28,16 @@ def extract_by_chapter( ) -> interfaces.ChapteredTranscript: logger.info(f"Extracting video {video_url} by chapter.") - video_info = self.yt_dlp_adapter.extract_video_info(video_url=video_url) + media_info = self.yt_dlp_adapter.extract_media_info(video_url=video_url) if self.ignore_source_chapters: - video_info.chapters = [] + media_info.chapters = [] if ( not skip_cache and ( cached_chaptered_transcript := self.file_cache.get_chaptered_transcript( - video_id=video_info.video_id + video_id=media_info.video_id ) ) is not None @@ -52,7 +52,7 @@ def extract_by_chapter( with Timer() as transcribe_timer: transcript = self.transcriber.transcribe( audio_path=audio_path, - video_info=video_info, + media_info=media_info, ) transcripts_by_chapter = [ interfaces.TranscriptChapter( @@ -65,14 +65,16 @@ def extract_by_chapter( chaptered_transcript = interfaces.ChapteredTranscript( url=video_url, - title=video_info.title, + title=media_info.title, + webpage_url=media_info.webpage_url, + webpage_url_domain=media_info.webpage_url_domain, chapters=transcripts_by_chapter, - chaptered_at_source=len(video_info.chapters) > 0, + chaptered_at_source=len(media_info.chapters) > 0, language=transcript.language, ) self.file_cache.cache_chaptered_transcript( - video_id=video_info.video_id, + video_id=media_info.video_id, transcript=chaptered_transcript, ) diff --git a/src/yt2doc/extraction/interfaces.py b/src/yt2doc/extraction/interfaces.py index a9f1992..4ba4d51 100644 --- a/src/yt2doc/extraction/interfaces.py +++ b/src/yt2doc/extraction/interfaces.py @@ -13,6 +13,8 @@ class TranscriptChapter(BaseModel): class ChapteredTranscript(BaseModel): url: str title: str + webpage_url: str + webpage_url_domain: str language: str chapters: typing.Sequence[TranscriptChapter] chaptered_at_source: bool diff --git a/src/yt2doc/factories.py b/src/yt2doc/factories.py index 2d6fc54..fcdd3f6 100644 --- a/src/yt2doc/factories.py +++ b/src/yt2doc/factories.py @@ -66,13 +66,13 @@ def get_yt2doc( else: formatter = MarkdownFormatter(paragraphs_segmenter=paragraphs_segmenter) - video_info_extractor = MediaInfoExtractor(temp_dir=temp_dir) + media_info_extractor = MediaInfoExtractor(temp_dir=temp_dir) transcriber = Transcriber( temp_dir=temp_dir, whisper_adapter=whisper_adapter, ) extractor = Extractor( - video_info_extractor=video_info_extractor, + media_info_extractor=media_info_extractor, transcriber=transcriber, file_cache=file_cache, ignore_source_chapters=ignore_source_chapters, diff --git a/src/yt2doc/formatting/formatter.py b/src/yt2doc/formatting/formatter.py index 51a7b2d..9e6027c 100644 --- a/src/yt2doc/formatting/formatter.py +++ b/src/yt2doc/formatting/formatter.py @@ -11,20 +11,29 @@ class MarkdownFormatter: def __init__( self, paragraphs_segmenter: interfaces.IParagraphsSegmenter, + # timestamp_paragraphs: bool, topic_segmenter: typing.Optional[interfaces.ITopicSegmenter] = None, ) -> None: self.paragraphs_segmenter = paragraphs_segmenter self.topic_segmenter = topic_segmenter self.video_title_template = "# {name}" self.chapter_title_template = "## {name}" + # self.timestamp_paragraphs = timestamp_paragraphs @staticmethod def _paragraphs_to_text( paragraphs: typing.Sequence[typing.Sequence[interfaces.Sentence]], + # timestamp_paragraphs: bool, + # webpage_url: str, + # webpage_url_domain: str, ) -> str: paragraph_texts = [] for paragraph in paragraphs: + first_sentence = paragraph[0] paragraph_text = "".join(sentence.text for sentence in paragraph) + # if timestamp_paragraphs: + # if webpage_url_domain == "youtube.com": + # timestamp_prefix = "[\({}\)]()" paragraph_texts.append(paragraph_text) return "\n\n".join(paragraph_texts) diff --git a/src/yt2doc/media/interfaces.py b/src/yt2doc/media/interfaces.py index c6ce725..de48aba 100644 --- a/src/yt2doc/media/interfaces.py +++ b/src/yt2doc/media/interfaces.py @@ -14,6 +14,8 @@ class MediaChapter(BaseModel): class MediaInfo(BaseModel): video_id: str title: str + webpage_url: str + webpage_url_domain: str chapters: typing.Sequence[MediaChapter] description: str @@ -23,7 +25,7 @@ class YtPlaylistInfo(BaseModel): video_urls: typing.Sequence[str] -class IYtVideoInfoExtractor(typing.Protocol): - def extract_video_info(self, video_url: str) -> MediaInfo: ... +class IYtMediaInfoExtractor(typing.Protocol): + def extract_media_info(self, video_url: str) -> MediaInfo: ... def extract_audio(self, video_url: str) -> Path: ... def extract_playlist_info(self, playlist_url: str) -> YtPlaylistInfo: ... diff --git a/src/yt2doc/media/media_info_extractor.py b/src/yt2doc/media/media_info_extractor.py index 325255f..55baa52 100644 --- a/src/yt2doc/media/media_info_extractor.py +++ b/src/yt2doc/media/media_info_extractor.py @@ -16,6 +16,7 @@ class YtDLPResponse(BaseModel): video_id: str = Field(alias="id") webpage_url: str + webpage_url_domain: str title: str description: str chapters: typing.Optional[typing.Sequence[interfaces.MediaChapter]] = None @@ -67,7 +68,7 @@ class MediaInfoExtractor: def __init__(self, temp_dir: Path): self.temp_dir = temp_dir - def extract_video_info(self, video_url: str) -> interfaces.MediaInfo: + def extract_media_info(self, video_url: str) -> interfaces.MediaInfo: ydl_opts = { "quiet": True, } @@ -80,6 +81,8 @@ def extract_video_info(self, video_url: str) -> interfaces.MediaInfo: return interfaces.MediaInfo( video_id=parsed_response.video_id, title=parsed_response.title, + webpage_url=parsed_response.webpage_url, + webpage_url_domain=parsed_response.webpage_url_domain, chapters=_merge_short_chapters(parsed_response.chapters or []), description=parsed_response.description, ) diff --git a/src/yt2doc/transcription/interfaces.py b/src/yt2doc/transcription/interfaces.py index 431eb64..d29b735 100644 --- a/src/yt2doc/transcription/interfaces.py +++ b/src/yt2doc/transcription/interfaces.py @@ -33,5 +33,5 @@ def transcribe( class ITranscriber(typing.Protocol): def transcribe( - self, audio_path: Path, video_info: youtube_interfaces.MediaInfo + self, audio_path: Path, media_info: youtube_interfaces.MediaInfo ) -> Transcription: ... diff --git a/src/yt2doc/transcription/transcriber.py b/src/yt2doc/transcription/transcriber.py index a6d2ebd..d586e83 100644 --- a/src/yt2doc/transcription/transcriber.py +++ b/src/yt2doc/transcription/transcriber.py @@ -69,18 +69,18 @@ def _clean_title(title: str, punctuations: punctuations.Punctuations) -> str: def _get_initial_prompt( self, language_code: str, - video_info: youtube_interfaces.MediaInfo, + media_info: youtube_interfaces.MediaInfo, ) -> str: punctuations_ = punctuations.get_punctuations(language_code=language_code) cleaned_title = self._clean_title( - title=video_info.title, + title=media_info.title, punctuations=punctuations_, ) cleaned_video_description = self._clean_video_description( - video_info.description, punctuations=punctuations_ + media_info.description, punctuations=punctuations_ ) chapter_titles = f"{punctuations_.comma}".join( - c.title for c in video_info.chapters + c.title for c in media_info.chapters ) return f"{cleaned_title}{punctuations_.full_stop} {cleaned_video_description} {chapter_titles}" @@ -114,14 +114,14 @@ def _convert_audio_to_wav(audio_path: Path) -> Path: return wav_audio_path def transcribe( - self, audio_path: Path, video_info: youtube_interfaces.MediaInfo + self, audio_path: Path, media_info: youtube_interfaces.MediaInfo ) -> interfaces.Transcription: wav_audio_path = self._convert_audio_to_wav(audio_path=audio_path) language_code = self.whisper_adapter.detect_language(audio_path=wav_audio_path) initial_prompt = self._get_initial_prompt( language_code=language_code, - video_info=video_info, + media_info=media_info, ) logger.info(f"Initial prompt: {initial_prompt}") @@ -133,8 +133,8 @@ def transcribe( rounded_full_audio_duration = round(full_audio_duration, 2) current_timestamp = 0.0 - if len(video_info.chapters) > 0: - chapters = video_info.chapters + if len(media_info.chapters) > 0: + chapters = media_info.chapters else: chapters = [ youtube_interfaces.MediaChapter(