From 54322e3a1265d36dd39889eab9c0ab7ea782a991 Mon Sep 17 00:00:00 2001 From: PaulNdrei Date: Mon, 6 May 2024 21:28:25 +0200 Subject: [PATCH] Add Playlists to CLI --- cli/__main__.py | 3 ++ cli/add_yt_playlist.py | 18 +++++++++ docker-compose.yml | 85 +++++++++++++++++++++++++--------------- gender/__main__.py | 4 +- preprocessor/__main__.py | 17 +++++++- 5 files changed, 92 insertions(+), 35 deletions(-) create mode 100644 cli/add_yt_playlist.py diff --git a/cli/__main__.py b/cli/__main__.py index 2977ad2..e334c96 100644 --- a/cli/__main__.py +++ b/cli/__main__.py @@ -3,6 +3,7 @@ from cli.add_yt_channel import add_yt_channel from cli.add_ccma_json import add_ccma_json from cli.add_yt_video import add_yt_video +from cli.add_yt_playlist import add_yt_playlist def main(): @@ -17,6 +18,8 @@ def main(): add_ccma_json(sys.argv[2]) if command == "add-yt-video" and sys.argv[2]: add_yt_video(sys.argv[2]) + if command == "add-yt-playlist" and sys.argv[2]: + add_yt_playlist(sys.argv[2]) if __name__ == "__main__": main() diff --git a/cli/add_yt_playlist.py b/cli/add_yt_playlist.py new file mode 100644 index 0000000..4f7c429 --- /dev/null +++ b/cli/add_yt_playlist.py @@ -0,0 +1,18 @@ +from pytube import Playlist +from db import get_connection +import traceback + +def add_yt_playlist(url): + try: + conn = get_connection() + cur = conn.cursor() + + playlist = Playlist(url) + print(f"Importing {len(playlist.video_urls)} videos") + for video_url in playlist.video_urls: + cur.execute("INSERT INTO sources (url, type) VALUES (%s,%s)", (video_url, "youtube")) + conn.commit() + print("Finished importing") + except Exception: + print("Failed to import channel to DB") + traceback.print_exc() \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 9ae64a0..e1466b7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -63,26 +63,32 @@ services: image: projecteaina/text2lang:latest container_name: datapipe-text2lang restart: unless-stopped + networks: + - text2lang-net expose: - "8000" preprocessor: image: projecteaina/datapipe:latest - container_name: datapipe-preprocessor restart: unless-stopped volumes: - datapipe-data:/datapipe - command: "python -m preprocessor" + command: "preprocessor" + deploy: + mode: replicated + replicas: 1 environment: - PYTHONUNBUFFERED=1 - PG_HOST=datapipe-db - PG_PASSWORD=${POSTGRES_PASSWORD} - - API_URL=http://text2lang:80/text2lang + - API_URL=http://text2lang:8000/text2lang + - SKIP_LICENSE_CHECK=${SKIP_LICENSE_CHECK} networks: - db-network + - text2lang-net depends_on: datapipe-db: - condition: service_healthy + condition: service_healthy fetcher: image: projecteaina/datapipe:latest @@ -110,11 +116,13 @@ services: converter: image: projecteaina/datapipe:latest - container_name: datapipe-converter restart: unless-stopped volumes: - datapipe-data:/datapipe - command: "python -m converter" + command: "converter" + deploy: + mode: replicated + replicas: 1 environment: - PYTHONUNBUFFERED=1 - PG_HOST=datapipe-db @@ -130,11 +138,13 @@ services: # vad-transcriber: # image: projecteaina/datapipe:latest - # container_name: datapipe-vad-transcriber # restart: unless-stopped # volumes: # - datapipe-data:/datapipe - # command: "python -m vosk_vad_transcriber" + # command: "vosk_vad_transcriber" + # deploy: + # mode: replicated + # replicas: 1 # environment: # - PYTHONUNBUFFERED=1 # - PG_HOST=datapipe-db @@ -142,35 +152,40 @@ services: # - VOSK_SERVER_HOST=ona-vosk # networks: # - db-network + # - ona-vosk-net # depends_on: # datapipe-db: # condition: service_healthy - gender: - image: projecteaina/datapipe:latest - container_name: datapipe-gender - restart: unless-stopped - volumes: - - datapipe-data:/datapipe - command: "python -m gender" - environment: - - PYTHONUNBUFFERED=1 - - PG_HOST=datapipe-db - - PG_PASSWORD=${POSTGRES_PASSWORD} - - CLIPS_PATH= /datapipe/clips - networks: - - db-network - depends_on: - datapipe-db: - condition: service_healthy + # gender: + # image: projecteaina/datapipe:latest + # restart: unless-stopped + # volumes: + # - datapipe-data:/datapipe + # command: "gender" + # deploy: + # mode: replicated + # replicas: 1 + # environment: + # - PYTHONUNBUFFERED=1 + # - PG_HOST=datapipe-db + # - PG_PASSWORD=${POSTGRES_PASSWORD} + # - CLIPS_PATH= /datapipe/clips + # networks: + # - db-network + # depends_on: + # datapipe-db: + # condition: service_healthy # splitter: # image: projecteaina/datapipe:latest - # container_name: datapipe-splitter # restart: unless-stopped # volumes: # - datapipe-data:/datapipe - # command: "python -m splitter" + # command: "splitter" + # deploy: + # mode: replicated + # replicas: 1 # environment: # - PYTHONUNBUFFERED=1 # - PG_HOST=datapipe-db @@ -184,10 +199,11 @@ services: # ona-vosk: # image: assistent/kaldi-catala:0.0.4 - # container_name: ona-vosk # restart: unless-stopped # environment: # - VOSK_SAMPLE_RATE=16000 + # networks: + # - ona-vosk-net # expose: # - "5001" @@ -195,16 +211,20 @@ services: # image: ghcr.io/ccoreilly/wav2vec2-catala-onnx:0.1.1 # container_name: wav2vec2-catala # restart: unless-stopped + # networks: + # - wav2vec2-net # expose: # - "8000" # wav2vec2-transcriber: # image: projecteaina/datapipe:latest - # container_name: datapipe-wav2vec2-transcriber # restart: unless-stopped # volumes: # - datapipe-data:/datapipe - # command: "python -m wav2vec2_transcriber" + # command: "wav2vec2_transcriber" + # deploy: + # mode: replicated + # replicas: 1 # environment: # - PYTHONUNBUFFERED=1 # - PG_HOST=datapipe-db @@ -212,17 +232,20 @@ services: # - API_URL=http://wav2vec2-catala/recognize # networks: # - db-network + # - wav2vec2-net # depends_on: # datapipe-db: # condition: service_healthy - volumes: datapipe-data: datapipe-db-data: grafana-data: networks: + # wav2vec2-net: + # ona-vosk-net: + text2lang-net: db-network: # driver_opts: # com.docker.network.driver.mtu: ${NETWORK_MTU} diff --git a/gender/__main__.py b/gender/__main__.py index 135d9ec..81fd3b3 100644 --- a/gender/__main__.py +++ b/gender/__main__.py @@ -1,4 +1,4 @@ -from os import getenv, path +from os import getenv, path, makedirs import sys from joblib import load from time import sleep @@ -20,7 +20,7 @@ if not path.exists(CLIPS_PATH): print(f"Clips path {CLIPS_PATH} does not exist!") - sys.exit(1) + makedirs(path.dirname(CLIPS_PATH), exist_ok=True) MFCC_MIN_FREQUENCY = 60 MFCC_MAX_FREQUENCY = 8_000 diff --git a/preprocessor/__main__.py b/preprocessor/__main__.py index 6e38219..26ee846 100644 --- a/preprocessor/__main__.py +++ b/preprocessor/__main__.py @@ -1,6 +1,5 @@ from os import getenv from time import sleep -from pytube import YouTube from urllib.error import HTTPError import json @@ -10,11 +9,22 @@ from db import get_connection from utils import GracefulKiller +from pytube import YouTube +# from pytube.innertube import _default_clients + +# _default_clients["ANDROID"]["context"]["client"]["clientVersion"] = "19.08.35" +# _default_clients["IOS"]["context"]["client"]["clientVersion"] = "19.08.35" +# _default_clients["ANDROID_EMBED"]["context"]["client"]["clientVersion"] = "19.08.35" +# _default_clients["IOS_EMBED"]["context"]["client"]["clientVersion"] = "19.08.35" +# _default_clients["IOS_MUSIC"]["context"]["client"]["clientVersion"] = "6.41" +# _default_clients["ANDROID_MUSIC"] = _default_clients["ANDROID_CREATOR"] + killer = GracefulKiller() API_TOKEN = getenv("API_TOKEN") API_URL = getenv("API_URL", "https://api-inference.huggingface.co/models/ivanlau/language-detection-fine-tuned-on-xlm-roberta-base") headers = {"Authorization": f"Bearer {API_TOKEN}"} +SKIP_LICENSE_CHECK = getenv("SKIP_LICENSE_CHECK", 'False').lower() in ('true', '1', 't') youtube_wait = 5 @@ -94,8 +104,11 @@ def youtube_license_check(yt): try: yt = get_youtube(source_id, url) new_status = "ready_for_download" if youtube_language_check(yt) else "bad_language" + if new_status == "bad_language": + print(f"Bad language: {url}") license = "CC-BY" if youtube_license_check(yt) else "PROP" - if license == "PROP": + if license == "PROP" and not SKIP_LICENSE_CHECK: + print(f"Bad licence (not CC-BY): {url} ") new_status = "bad_license" captions = 'ca' in yt.captions cur.execute(f"UPDATE sources SET status='{new_status}', license='{license}', has_captions='{captions}', status_update=now() WHERE source_id = '{source_id}'")