Skip to content

Commit

Permalink
WIP
Browse files (browse the repository at this point in the history)
  • Loading branch information
shun-liang committed Nov 5, 2024
1 parent af4ef0f commit 7af3f09
Show file tree
Hide file tree
Showing 8 changed files with 41 additions and 23 deletions.
20 changes: 11 additions & 9 deletions src/yt2doc/extraction/extractor.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -11,12 +11,12 @@
class Extractor:
def __init__(
self,
video_info_extractor: youtube_interfaces.IYtVideoInfoExtractor,
media_info_extractor: youtube_interfaces.IYtMediaInfoExtractor,
transcriber: transcription_interfaces.ITranscriber,
file_cache: interfaces.IFileCache,
ignore_source_chapters: bool,
) -> None:
self.yt_dlp_adapter = video_info_extractor
self.yt_dlp_adapter = media_info_extractor
self.transcriber = transcriber
self.file_cache = file_cache
self.ignore_source_chapters = ignore_source_chapters
Expand All @@ -28,16 +28,16 @@ def extract_by_chapter(
) -> interfaces.ChapteredTranscript:
logger.info(f"Extracting video {video_url} by chapter.")

video_info = self.yt_dlp_adapter.extract_video_info(video_url=video_url)
media_info = self.yt_dlp_adapter.extract_media_info(video_url=video_url)

if self.ignore_source_chapters:
video_info.chapters = []
media_info.chapters = []

if (
not skip_cache
and (
cached_chaptered_transcript := self.file_cache.get_chaptered_transcript(
video_id=video_info.video_id
video_id=media_info.video_id
)
)
is not None
Expand All @@ -52,7 +52,7 @@ def extract_by_chapter(
with Timer() as transcribe_timer:
transcript = self.transcriber.transcribe(
audio_path=audio_path,
video_info=video_info,
media_info=media_info,
)
transcripts_by_chapter = [
interfaces.TranscriptChapter(
Expand All @@ -65,14 +65,16 @@ def extract_by_chapter(

chaptered_transcript = interfaces.ChapteredTranscript(
url=video_url,
title=video_info.title,
title=media_info.title,
webpage_url=media_info.webpage_url,
webpage_url_domain=media_info.webpage_url_domain,
chapters=transcripts_by_chapter,
chaptered_at_source=len(video_info.chapters) > 0,
chaptered_at_source=len(media_info.chapters) > 0,
language=transcript.language,
)

self.file_cache.cache_chaptered_transcript(
video_id=video_info.video_id,
video_id=media_info.video_id,
transcript=chaptered_transcript,
)

Expand Down
2 changes: 2 additions & 0 deletions src/yt2doc/extraction/interfaces.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -13,6 +13,8 @@ class TranscriptChapter(BaseModel):
class ChapteredTranscript(BaseModel):
url: str
title: str
webpage_url: str
webpage_url_domain: str
language: str
chapters: typing.Sequence[TranscriptChapter]
chaptered_at_source: bool
Expand Down
4 changes: 2 additions & 2 deletions src/yt2doc/factories.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -66,13 +66,13 @@ def get_yt2doc(
else:
formatter = MarkdownFormatter(paragraphs_segmenter=paragraphs_segmenter)

video_info_extractor = MediaInfoExtractor(temp_dir=temp_dir)
media_info_extractor = MediaInfoExtractor(temp_dir=temp_dir)
transcriber = Transcriber(
temp_dir=temp_dir,
whisper_adapter=whisper_adapter,
)
extractor = Extractor(
video_info_extractor=video_info_extractor,
media_info_extractor=media_info_extractor,
transcriber=transcriber,
file_cache=file_cache,
ignore_source_chapters=ignore_source_chapters,
Expand Down
9 changes: 9 additions & 0 deletions src/yt2doc/formatting/formatter.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -11,20 +11,29 @@ class MarkdownFormatter:
def __init__(
self,
paragraphs_segmenter: interfaces.IParagraphsSegmenter,
# timestamp_paragraphs: bool,
topic_segmenter: typing.Optional[interfaces.ITopicSegmenter] = None,
) -> None:
self.paragraphs_segmenter = paragraphs_segmenter
self.topic_segmenter = topic_segmenter
self.video_title_template = "# {name}"
self.chapter_title_template = "## {name}"
# self.timestamp_paragraphs = timestamp_paragraphs

@staticmethod
def _paragraphs_to_text(
paragraphs: typing.Sequence[typing.Sequence[interfaces.Sentence]],
# timestamp_paragraphs: bool,
# webpage_url: str,
# webpage_url_domain: str,
) -> str:
paragraph_texts = []
for paragraph in paragraphs:
first_sentence = paragraph[0]
paragraph_text = "".join(sentence.text for sentence in paragraph)
# if timestamp_paragraphs:
# if webpage_url_domain == "youtube.com":
# timestamp_prefix = "[\({}\)]()"
paragraph_texts.append(paragraph_text)
return "\n\n".join(paragraph_texts)

Expand Down
6 changes: 4 additions & 2 deletions src/yt2doc/media/interfaces.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,6 +14,8 @@ class MediaChapter(BaseModel):
class MediaInfo(BaseModel):
video_id: str
title: str
webpage_url: str
webpage_url_domain: str
chapters: typing.Sequence[MediaChapter]
description: str

Expand All @@ -23,7 +25,7 @@ class YtPlaylistInfo(BaseModel):
video_urls: typing.Sequence[str]


class IYtVideoInfoExtractor(typing.Protocol):
def extract_video_info(self, video_url: str) -> MediaInfo: ...
class IYtMediaInfoExtractor(typing.Protocol):
def extract_media_info(self, video_url: str) -> MediaInfo: ...
def extract_audio(self, video_url: str) -> Path: ...
def extract_playlist_info(self, playlist_url: str) -> YtPlaylistInfo: ...
5 changes: 4 additions & 1 deletion src/yt2doc/media/media_info_extractor.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -16,6 +16,7 @@
class YtDLPResponse(BaseModel):
video_id: str = Field(alias="id")
webpage_url: str
webpage_url_domain: str
title: str
description: str
chapters: typing.Optional[typing.Sequence[interfaces.MediaChapter]] = None
Expand Down Expand Up @@ -67,7 +68,7 @@ class MediaInfoExtractor:
def __init__(self, temp_dir: Path):
self.temp_dir = temp_dir

def extract_video_info(self, video_url: str) -> interfaces.MediaInfo:
def extract_media_info(self, video_url: str) -> interfaces.MediaInfo:
ydl_opts = {
"quiet": True,
}
Expand All @@ -80,6 +81,8 @@ def extract_video_info(self, video_url: str) -> interfaces.MediaInfo:
return interfaces.MediaInfo(
video_id=parsed_response.video_id,
title=parsed_response.title,
webpage_url=parsed_response.webpage_url,
webpage_url_domain=parsed_response.webpage_url_domain,
chapters=_merge_short_chapters(parsed_response.chapters or []),
description=parsed_response.description,
)
Expand Down
2 changes: 1 addition & 1 deletion src/yt2doc/transcription/interfaces.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -33,5 +33,5 @@ def transcribe(

class ITranscriber(typing.Protocol):
def transcribe(
self, audio_path: Path, video_info: youtube_interfaces.MediaInfo
self, audio_path: Path, media_info: youtube_interfaces.MediaInfo
) -> Transcription: ...
16 changes: 8 additions & 8 deletions src/yt2doc/transcription/transcriber.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -69,18 +69,18 @@ def _clean_title(title: str, punctuations: punctuations.Punctuations) -> str:
def _get_initial_prompt(
self,
language_code: str,
video_info: youtube_interfaces.MediaInfo,
media_info: youtube_interfaces.MediaInfo,
) -> str:
punctuations_ = punctuations.get_punctuations(language_code=language_code)
cleaned_title = self._clean_title(
title=video_info.title,
title=media_info.title,
punctuations=punctuations_,
)
cleaned_video_description = self._clean_video_description(
video_info.description, punctuations=punctuations_
media_info.description, punctuations=punctuations_
)
chapter_titles = f"{punctuations_.comma}".join(
c.title for c in video_info.chapters
c.title for c in media_info.chapters
)
return f"{cleaned_title}{punctuations_.full_stop} {cleaned_video_description} {chapter_titles}"

Expand Down Expand Up @@ -114,14 +114,14 @@ def _convert_audio_to_wav(audio_path: Path) -> Path:
return wav_audio_path

def transcribe(
self, audio_path: Path, video_info: youtube_interfaces.MediaInfo
self, audio_path: Path, media_info: youtube_interfaces.MediaInfo
) -> interfaces.Transcription:
wav_audio_path = self._convert_audio_to_wav(audio_path=audio_path)

language_code = self.whisper_adapter.detect_language(audio_path=wav_audio_path)
initial_prompt = self._get_initial_prompt(
language_code=language_code,
video_info=video_info,
media_info=media_info,
)
logger.info(f"Initial prompt: {initial_prompt}")

Expand All @@ -133,8 +133,8 @@ def transcribe(

rounded_full_audio_duration = round(full_audio_duration, 2)
current_timestamp = 0.0
if len(video_info.chapters) > 0:
chapters = video_info.chapters
if len(media_info.chapters) > 0:
chapters = media_info.chapters
else:
chapters = [
youtube_interfaces.MediaChapter(
Expand Down

0 comments on commit 7af3f09

Please sign in to comment.