Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

using yt-dlp package instead of subprocess #147

Merged
merged 1 commit into from
Jun 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ dependencies = [
"requests==2.31.0",
"rich==13.7.1",
"sqlite-utils==3.36",
"beautifulsoup4==4.12.3"
"beautifulsoup4==4.12.3",
"yt-dlp==2024.5.27"
]

[project.scripts]
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ chromadb==0.5.2
requests==2.31.0
rich==13.7.1
sqlite-utils==3.36
beautifulsoup4==4.12.3
beautifulsoup4==4.12.3
yt-dlp==2024.5.27
151 changes: 68 additions & 83 deletions yt_fts/download.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import yt_dlp
import tempfile
import subprocess, re, os, sqlite3, json
import re
import os
import sqlite3
import json

from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
from urllib.parse import urlparse
Expand All @@ -12,6 +17,7 @@

from rich.progress import track
from rich.console import Console
console = Console()

def handle_reject_consent_cookie(channel_url, s):
"""
Expand Down Expand Up @@ -82,80 +88,60 @@ def get_videos_list(channel_url):
console = Console()

with console.status("[bold green]Scraping video urls, this might take a little...") as status:
cmd = [
"yt-dlp",
"--flat-playlist",
"--print",
"id",
f"{channel_url}"
]
res = subprocess.run(cmd, capture_output=True, check=True)
list_of_videos_urls = res.stdout.decode().splitlines()

streams_url = channel_url.replace("/videos", "/streams")
cmd = [
"yt-dlp",
"--flat-playlist",
"--print",
"id",
streams_url
]
try:
res = subprocess.run(cmd, capture_output=True, check=True)
live_stream_urls = res.stdout.decode().splitlines()
if len(live_stream_urls) > 0:
list_of_videos_urls.extend(live_stream_urls)
except subprocess.CalledProcessError:
console.print("[bold red]No streams tab found or error fetching streams.")
ydl_opts = {
'extract_flat': True,
'quiet': True,
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
info = ydl.extract_info(channel_url, download=False)
list_of_videos_urls = [entry['id'] for entry in info['entries']]

streams_url = channel_url.replace("/videos", "/streams")
try:
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
streams_info = ydl.extract_info(streams_url, download=False)
live_stream_urls = [entry['id'] for entry in streams_info['entries']]
if len(live_stream_urls) > 0:
list_of_videos_urls.extend(live_stream_urls)
except Exception:
console.print("[bold red]No streams found")

return list_of_videos_urls
return list_of_videos_urls


def get_playlist_data(playlist_url):
    """
    Collect channel and video metadata for every video in a playlist.

    The playlist is read through yt_dlp with flat extraction and
    ``download=False``, so nothing is downloaded.

    Args:
        playlist_url: URL of the playlist to scan.

    Returns:
        A list with one dict per usable playlist entry:
        [
            {
                'channel_name': ...,
                'channel_id': ...,
                'video_id': ...,
                'channel_url': ...,
                'video_url': ...,
            },
        ]
        Entries that carry no video id or no channel id are left out.

    Raises:
        Errors raised by ``extract_info`` are not handled here and
        propagate to the caller.
    """
    console = Console()

    ydl_opts = {
        'quiet': True,
        'extract_flat': True,
    }

    playlist_data = []

    with console.status("[bold green]Scraping video urls, this might take a little..."):
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(playlist_url, download=False)

            for entry in (info or {}).get('entries') or []:
                # Guard against placeholder entries: without both ids the
                # URLs built below would point at ".../channel/None/videos".
                if not entry:
                    continue
                video_id = entry.get('id')
                channel_id = entry.get('channel_id')
                if not video_id or not channel_id:
                    continue

                playlist_data.append({
                    'channel_name': entry.get('channel'),
                    'channel_id': channel_id,
                    'video_id': video_id,
                    'channel_url': f"https://www.youtube.com/channel/{channel_id}/videos",
                    'video_url': f"https://youtu.be/{video_id}",
                })

    return playlist_data


def download_vtts(number_of_jobs, video_ids, language, tmp_dir):
"""
Multi-threaded download of vtt files
"""

# showing progress on a multi-threaded task might be more trouble than it's worth
# console = Console()

executor = ThreadPoolExecutor(number_of_jobs)
futures = []

Expand All @@ -168,17 +154,26 @@ def download_vtts(number_of_jobs, video_ids, language ,tmp_dir):
futures[i].result()


def quiet_progress_hook(d):
    """
    Progress hook that prints the name of each file once it is finished.

    Args:
        d: status dict passed to the hook. Only the 'status' and
           'filename' keys are read.

    Prints `` -> <file name>`` (directory part removed) when the status is
    'finished'; every other status is ignored.
    """
    # .get() keeps a payload without the expected keys from raising
    # KeyError inside a download worker.
    if d.get('status') != 'finished':
        return

    filename = d.get('filename')
    if filename:
        print(f" -> {Path(filename).name}")


def get_vtt(tmp_dir, video_url, language):
    """
    Fetch the automatic subtitles and the info json for a single video.

    The media itself is skipped ('skip_download'); yt_dlp writes its output
    into ``tmp_dir`` using the video id as the file stem.

    Args:
        tmp_dir: directory the subtitle and info json files are written to.
        video_url: URL of the video to process.
        language: subtitle language code to request; live chat is excluded.
    """
    ydl_opts = {
        'outtmpl': f'{tmp_dir}/%(id)s',
        'writeinfojson': True,
        'writeautomaticsub': True,
        'subtitlesformat': 'vtt',
        'skip_download': True,
        'subtitleslangs': [language, '-live_chat'],
        'quiet': True,
        'progress_hooks': [quiet_progress_hook],
        # The previous implementation shelled out to the yt-dlp CLI without
        # check=True, so a video that failed was simply skipped. The Python
        # API raises by default, which would surface through the worker's
        # future and abort the whole batch; keep the skip-and-continue
        # behaviour instead.
        'ignoreerrors': 'only_download',
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])


def vtt_to_db(dir_path):
Expand All @@ -187,21 +182,11 @@ def vtt_to_db(dir_path):
the vtt parsing function, then inserts the data into the database.
"""
items = os.listdir(dir_path)
file_paths = []

for item in items:
# ignore other files e.g. info.json files
if not item.endswith('.vtt'):
continue

item_path = os.path.join(dir_path, item)
if os.path.isfile(item_path):
file_paths.append(item_path)
file_paths = [os.path.join(dir_path, item) for item in items if item.endswith('.vtt')]

con = sqlite3.connect(get_db_path())
cur = con.cursor()


for vtt in track(file_paths, description="Adding subtitles to database..."):
base_name = os.path.basename(vtt)

Expand All @@ -213,7 +198,7 @@ def vtt_to_db(dir_path):
with open(vid_json_path, 'r', encoding='utf-8', errors='ignore') as f:
vid_json = json.load(f)

vid_title = vid_json['title']
vid_title = vid_json['title']
vid_date = get_date(vid_json['upload_date'])
channel_id = vid_json['channel_id']

Expand Down
15 changes: 10 additions & 5 deletions yt_fts/yt_fts.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import click
import sys
import requests

from .config import get_config_path, get_db_path, get_or_make_chroma_path
Expand All @@ -10,6 +11,7 @@
from rich.console import Console

YT_FTS_VERSION = "0.1.49"
console = Console()

@click.group(context_settings={"help_option_names": ["-h", "--help"]})
@click.version_option(YT_FTS_VERSION, message='yt_fts version: %(version)s')
Expand All @@ -32,13 +34,16 @@ def cli():
@click.option("-j", "--number-of-jobs", type=int, default=1, help="Optional number of jobs to parallelize the run")
def download(url, playlist, language, number_of_jobs):

console = Console()
s = requests.session()
handle_reject_consent_cookie(url, s)

if playlist == True:
if "playlist?" not in url:
console.print(f"\n[bold red]Error:[/bold red] Invalid playlist url {url}")
print("\nYouTube playlists have this format: https://www.youtube.com/playlist?list=<playlist_id>\n")
sys.exit(1)
download_playlist(url, s, language, number_of_jobs)
return
sys.exit(0)

# find out if the channel exists on the internet
with console.status("[bold green]Getting Channel ID...") as status:
Expand Down Expand Up @@ -141,9 +146,9 @@ def delete(channel):
channel_name = get_channel_name_from_id(channel_id)
channel_url = f"https://www.youtube.com/channel/{channel_id}/videos"

print(f"Deleting channel {channel_name}: {channel_url}")
print("Are you sure you want to delete this channel and all its data?")
confirm = input("y/n: ")
console.print(f"Deleting channel [bold]\"{channel_name}\"[/bold]: {channel_url}")
console.print("[bold]Are you sure you want to delete this channel and all its data?[/bold]")
confirm = input("(Y/n): ")

if confirm == "y":
delete_channel(channel_id)
Expand Down
Loading