From 0f446f6e94433108e2041e16cb1a072d91945556 Mon Sep 17 00:00:00 2001 From: Joe Martinez Date: Tue, 25 Jun 2024 20:44:17 -0500 Subject: [PATCH] using yt-dlp package instead of subprocess --- pyproject.toml | 3 +- requirements.txt | 3 +- yt_fts/download.py | 151 ++++++++++++++++++++------------------------- yt_fts/yt_fts.py | 15 +++-- 4 files changed, 82 insertions(+), 90 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 87d10eb..7b51e96 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,8 @@ dependencies = [ "requests==2.31.0", "rich==13.7.1", "sqlite-utils==3.36", - "beautifulsoup4==4.12.3" + "beautifulsoup4==4.12.3", + "yt-dlp==2024.5.27" ] [project.scripts] diff --git a/requirements.txt b/requirements.txt index 1e154fd..d2e0ffd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,5 @@ chromadb==0.5.2 requests==2.31.0 rich==13.7.1 sqlite-utils==3.36 -beautifulsoup4==4.12.3 \ No newline at end of file +beautifulsoup4==4.12.3 +yt-dlp==2024.5.27 \ No newline at end of file diff --git a/yt_fts/download.py b/yt_fts/download.py index 4124155..4c69980 100644 --- a/yt_fts/download.py +++ b/yt_fts/download.py @@ -1,6 +1,11 @@ +import yt_dlp import tempfile -import subprocess, re, os, sqlite3, json +import re +import os +import sqlite3 +import json +from pathlib import Path from concurrent.futures import ThreadPoolExecutor from bs4 import BeautifulSoup from urllib.parse import urlparse @@ -12,6 +17,7 @@ from rich.progress import track from rich.console import Console +console = Console() def handle_reject_consent_cookie(channel_url, s): """ @@ -82,80 +88,60 @@ def get_videos_list(channel_url): console = Console() with console.status("[bold green]Scraping video urls, this might take a little...") as status: - cmd = [ - "yt-dlp", - "--flat-playlist", - "--print", - "id", - f"{channel_url}" - ] - res = subprocess.run(cmd, capture_output=True, check=True) - list_of_videos_urls = res.stdout.decode().splitlines() - - streams_url = channel_url.replace("/videos", "/streams") - cmd = [ - "yt-dlp", - "--flat-playlist", - "--print", - "id", - streams_url - ] - try: - res = subprocess.run(cmd, capture_output=True, check=True) - live_stream_urls = res.stdout.decode().splitlines() - if len(live_stream_urls) > 0: - list_of_videos_urls.extend(live_stream_urls) - except subprocess.CalledProcessError: - console.print("[bold red]No streams tab found or error fetching streams.") + ydl_opts = { + 'extract_flat': True, + 'quiet': True, + } + + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + info = ydl.extract_info(channel_url, download=False) + list_of_videos_urls = [entry['id'] for entry in info['entries']] + streams_url = channel_url.replace("/videos", "/streams") + try: + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + streams_info = ydl.extract_info(streams_url, download=False) + live_stream_urls = [entry['id'] for entry in streams_info['entries']] + if len(live_stream_urls) > 0: + list_of_videos_urls.extend(live_stream_urls) + except Exception: + console.print("[bold red]No streams found") - return list_of_videos_urls + return list_of_videos_urls def get_playlist_data(playlist_url): """ Returns a list of channel ids and video ids from a playlist - [ - ['channel_id', 'video_id'], - ] """ - console = Console() with console.status("[bold green]Scraping video urls, this might take a little...") as status: - cmd = [ - "yt-dlp", - "--print", - "%(channel)s,%(channel_id)s,%(id)s", - f"{playlist_url}" - ] - res = subprocess.run(cmd, capture_output=True, check=True) - data = res.stdout.decode().splitlines() - - playlist_data = [] - - for vid in data: - vid = vid.split(',') - vid_obj = { - 'channel_name': vid[0], - 'channel_id': vid[1], - 'video_id': vid[2], - 'channel_url': f"https://www.youtube.com/channel/{vid[1]}/videos", - 'video_url': f"https://youtu.be/{vid[2]}" - } - playlist_data.append(vid_obj) - - return playlist_data - - -def download_vtts(number_of_jobs, video_ids, language ,tmp_dir): + ydl_opts = { + 'quiet': True, + 'extract_flat': True, + } + + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + info = ydl.extract_info(playlist_url, download=False) + playlist_data = [] + for entry in info['entries']: + vid_obj = { + 'channel_name': entry['channel'], + 'channel_id': entry['channel_id'], + 'video_id': entry['id'], + 'channel_url': f"https://www.youtube.com/channel/{entry['channel_id']}/videos", + 'video_url': f"https://youtu.be/{entry['id']}" + } + playlist_data.append(vid_obj) + + return playlist_data + + +def download_vtts(number_of_jobs, video_ids, language, tmp_dir): """ Multi-threaded download of vtt files """ - - # showing progress on a multi-threaded task might be more trouble than it's worth - # console = Console() - executor = ThreadPoolExecutor(number_of_jobs) futures = [] @@ -168,17 +154,26 @@ def download_vtts(number_of_jobs, video_ids, language ,tmp_dir): futures[i].result() +def quiet_progress_hook(d): + if d['status'] == 'finished': + filename = Path(d['filename']).name + print(f" -> {filename}") + + def get_vtt(tmp_dir, video_url, language): - subprocess.run([ - "yt-dlp", - "-o", f"{tmp_dir}/%(id)s", - "--write-info-json", - "--write-auto-sub", - "--convert-subs", "vtt", - "--skip-download", - "--sub-langs", f"{language},-live_chat", - video_url - ]) + ydl_opts = { + 'outtmpl': f'{tmp_dir}/%(id)s', + 'writeinfojson': True, + 'writeautomaticsub': True, + 'subtitlesformat': 'vtt', + 'skip_download': True, + 'subtitleslangs': [language, '-live_chat'], + 'quiet': True, + 'progress_hooks': [quiet_progress_hook] + } + + with yt_dlp.YoutubeDL(ydl_opts) as ydl: + ydl.download([video_url]) def vtt_to_db(dir_path): @@ -187,21 +182,11 @@ def vtt_to_db(dir_path): the vtt parsing function, then inserts the data into the database. """ items = os.listdir(dir_path) - file_paths = [] - - for item in items: - # ignore other files e.g. info.json files - if not item.endswith('.vtt'): - continue - - item_path = os.path.join(dir_path, item) - if os.path.isfile(item_path): - file_paths.append(item_path) + file_paths = [os.path.join(dir_path, item) for item in items if item.endswith('.vtt')] con = sqlite3.connect(get_db_path()) cur = con.cursor() - for vtt in track(file_paths, description="Adding subtitles to database..."): base_name = os.path.basename(vtt) @@ -213,7 +198,7 @@ def vtt_to_db(dir_path): with open(vid_json_path, 'r', encoding='utf-8', errors='ignore') as f: vid_json = json.load(f) - vid_title = vid_json['title'] + vid_title = vid_json['title'] vid_date = get_date(vid_json['upload_date']) channel_id = vid_json['channel_id'] diff --git a/yt_fts/yt_fts.py b/yt_fts/yt_fts.py index 156d27c..2758b8f 100644 --- a/yt_fts/yt_fts.py +++ b/yt_fts/yt_fts.py @@ -1,4 +1,5 @@ import click +import sys import requests from .config import get_config_path, get_db_path, get_or_make_chroma_path @@ -10,6 +11,7 @@ from rich.console import Console YT_FTS_VERSION = "0.1.49" +console = Console() @click.group(context_settings={"help_option_names": ["-h", "--help"]}) @click.version_option(YT_FTS_VERSION, message='yt_fts version: %(version)s') @@ -32,13 +34,16 @@ def cli(): @click.option("-j", "--number-of-jobs", type=int, default=1, help="Optional number of jobs to parallelize the run") def download(url, playlist, language, number_of_jobs): - console = Console() s = requests.session() handle_reject_consent_cookie(url, s) if playlist == True: + if "playlist?" not in url: + console.print(f"\n[bold red]Error:[/bold red] Invalid playlist url {url}") + print("\nYouTube playlists have this format: https://www.youtube.com/playlist?list=\n") + sys.exit(1) download_playlist(url, s, language, number_of_jobs) - return + sys.exit(0) # find out if the channel exists on the internet with console.status("[bold green]Getting Channel ID...") as status: @@ -141,9 +146,9 @@ def delete(channel): channel_name = get_channel_name_from_id(channel_id) channel_url = f"https://www.youtube.com/channel/{channel_id}/videos" - print(f"Deleting channel {channel_name}: {channel_url}") - print("Are you sure you want to delete this channel and all its data?") - confirm = input("y/n: ") + console.print(f"Deleting channel [bold]\"{channel_name}\"[/bold]: {channel_url}") + console.print("[bold]Are you sure you want to delete this channel and all its data?[/bold]") + confirm = input("(Y/n): ") if confirm == "y": delete_channel(channel_id)