Skip to content

Commit

Permalink
Add Playlists to CLI
Browse files Browse the repository at this point in the history
  • Loading branch information
PaulNdrei committed May 6, 2024
1 parent 5329359 commit 54322e3
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 35 deletions.
3 changes: 3 additions & 0 deletions cli/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from cli.add_yt_channel import add_yt_channel
from cli.add_ccma_json import add_ccma_json
from cli.add_yt_video import add_yt_video
from cli.add_yt_playlist import add_yt_playlist


def main():
Expand All @@ -17,6 +18,8 @@ def main():
add_ccma_json(sys.argv[2])
if command == "add-yt-video" and sys.argv[2]:
add_yt_video(sys.argv[2])
if command == "add-yt-playlist" and sys.argv[2]:
add_yt_playlist(sys.argv[2])

if __name__ == "__main__":
main()
18 changes: 18 additions & 0 deletions cli/add_yt_playlist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from pytube import Playlist
from db import get_connection
import traceback

def add_yt_playlist(url):
    """Register every video of a YouTube playlist as a source in the DB.

    Builds a pytube ``Playlist`` from ``url`` and inserts one row per video
    URL into the ``sources`` table with type ``"youtube"``. All inserts are
    committed together once the whole playlist has been processed.

    Any failure is reported on stdout together with its traceback and is not
    re-raised, so the CLI keeps its best-effort behaviour.

    Args:
        url: URL of the YouTube playlist to import.
    """
    conn = None
    try:
        conn = get_connection()
        cur = conn.cursor()

        # Read the property once and reuse it for both the count and the loop.
        video_urls = Playlist(url).video_urls
        print(f"Importing {len(video_urls)} videos")
        for video_url in video_urls:
            cur.execute(
                "INSERT INTO sources (url, type) VALUES (%s,%s)",
                (video_url, "youtube"),
            )
        conn.commit()
        print("Finished importing")
    except Exception:
        # Best-effort CLI command: report the problem instead of crashing.
        print("Failed to import playlist to DB")
        traceback.print_exc()
    finally:
        # NOTE(review): assumes get_connection() returns a fresh DB-API
        # connection owned by this call - confirm it is not shared/cached.
        # Closing without a commit discards the partially inserted rows.
        if conn is not None:
            conn.close()
85 changes: 54 additions & 31 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,26 +63,32 @@ services:
image: projecteaina/text2lang:latest
container_name: datapipe-text2lang
restart: unless-stopped
networks:
- text2lang-net
expose:
- "8000"

preprocessor:
image: projecteaina/datapipe:latest
container_name: datapipe-preprocessor
restart: unless-stopped
volumes:
- datapipe-data:/datapipe
command: "python -m preprocessor"
command: "preprocessor"
deploy:
mode: replicated
replicas: 1
environment:
- PYTHONUNBUFFERED=1
- PG_HOST=datapipe-db
- PG_PASSWORD=${POSTGRES_PASSWORD}
- API_URL=http://text2lang:80/text2lang
- API_URL=http://text2lang:8000/text2lang
- SKIP_LICENSE_CHECK=${SKIP_LICENSE_CHECK}
networks:
- db-network
- text2lang-net
depends_on:
datapipe-db:
condition: service_healthy
condition: service_healthy

fetcher:
image: projecteaina/datapipe:latest
Expand Down Expand Up @@ -110,11 +116,13 @@ services:

converter:
image: projecteaina/datapipe:latest
container_name: datapipe-converter
restart: unless-stopped
volumes:
- datapipe-data:/datapipe
command: "python -m converter"
command: "converter"
deploy:
mode: replicated
replicas: 1
environment:
- PYTHONUNBUFFERED=1
- PG_HOST=datapipe-db
Expand All @@ -130,47 +138,54 @@ services:

# vad-transcriber:
# image: projecteaina/datapipe:latest
# container_name: datapipe-vad-transcriber
# restart: unless-stopped
# volumes:
# - datapipe-data:/datapipe
# command: "python -m vosk_vad_transcriber"
# command: "vosk_vad_transcriber"
# deploy:
# mode: replicated
# replicas: 1
# environment:
# - PYTHONUNBUFFERED=1
# - PG_HOST=datapipe-db
# - PG_PASSWORD=${POSTGRES_PASSWORD}
# - VOSK_SERVER_HOST=ona-vosk
# networks:
# - db-network
# - ona-vosk-net
# depends_on:
# datapipe-db:
# condition: service_healthy

gender:
image: projecteaina/datapipe:latest
container_name: datapipe-gender
restart: unless-stopped
volumes:
- datapipe-data:/datapipe
command: "python -m gender"
environment:
- PYTHONUNBUFFERED=1
- PG_HOST=datapipe-db
- PG_PASSWORD=${POSTGRES_PASSWORD}
- CLIPS_PATH= /datapipe/clips
networks:
- db-network
depends_on:
datapipe-db:
condition: service_healthy
# gender:
# image: projecteaina/datapipe:latest
# restart: unless-stopped
# volumes:
# - datapipe-data:/datapipe
# command: "gender"
# deploy:
# mode: replicated
# replicas: 1
# environment:
# - PYTHONUNBUFFERED=1
# - PG_HOST=datapipe-db
# - PG_PASSWORD=${POSTGRES_PASSWORD}
# - CLIPS_PATH= /datapipe/clips
# networks:
# - db-network
# depends_on:
# datapipe-db:
# condition: service_healthy

# splitter:
# image: projecteaina/datapipe:latest
# container_name: datapipe-splitter
# restart: unless-stopped
# volumes:
# - datapipe-data:/datapipe
# command: "python -m splitter"
# command: "splitter"
# deploy:
# mode: replicated
# replicas: 1
# environment:
# - PYTHONUNBUFFERED=1
# - PG_HOST=datapipe-db
Expand All @@ -184,45 +199,53 @@ services:

# ona-vosk:
# image: assistent/kaldi-catala:0.0.4
# container_name: ona-vosk
# restart: unless-stopped
# environment:
# - VOSK_SAMPLE_RATE=16000
# networks:
# - ona-vosk-net
# expose:
# - "5001"

# wav2vec2-catala:
# image: ghcr.io/ccoreilly/wav2vec2-catala-onnx:0.1.1
# container_name: wav2vec2-catala
# restart: unless-stopped
# networks:
# - wav2vec2-net
# expose:
# - "8000"

# wav2vec2-transcriber:
# image: projecteaina/datapipe:latest
# container_name: datapipe-wav2vec2-transcriber
# restart: unless-stopped
# volumes:
# - datapipe-data:/datapipe
# command: "python -m wav2vec2_transcriber"
# command: "wav2vec2_transcriber"
# deploy:
# mode: replicated
# replicas: 1
# environment:
# - PYTHONUNBUFFERED=1
# - PG_HOST=datapipe-db
# - PG_PASSWORD=${POSTGRES_PASSWORD}
# - API_URL=http://wav2vec2-catala/recognize
# networks:
# - db-network
# - wav2vec2-net
# depends_on:
# datapipe-db:
# condition: service_healthy


volumes:
datapipe-data:
datapipe-db-data:
grafana-data:

networks:
# wav2vec2-net:
# ona-vosk-net:
text2lang-net:
db-network:
# driver_opts:
# com.docker.network.driver.mtu: ${NETWORK_MTU}
4 changes: 2 additions & 2 deletions gender/__main__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from os import getenv, path
from os import getenv, path, makedirs
import sys
from joblib import load
from time import sleep
Expand All @@ -20,7 +20,7 @@

if not path.exists(CLIPS_PATH):
print(f"Clips path {CLIPS_PATH} does not exist!")
sys.exit(1)
makedirs(path.dirname(CLIPS_PATH), exist_ok=True)

MFCC_MIN_FREQUENCY = 60
MFCC_MAX_FREQUENCY = 8_000
Expand Down
17 changes: 15 additions & 2 deletions preprocessor/__main__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from os import getenv
from time import sleep
from pytube import YouTube
from urllib.error import HTTPError

import json
Expand All @@ -10,11 +9,22 @@
from db import get_connection
from utils import GracefulKiller

from pytube import YouTube
# from pytube.innertube import _default_clients

# _default_clients["ANDROID"]["context"]["client"]["clientVersion"] = "19.08.35"
# _default_clients["IOS"]["context"]["client"]["clientVersion"] = "19.08.35"
# _default_clients["ANDROID_EMBED"]["context"]["client"]["clientVersion"] = "19.08.35"
# _default_clients["IOS_EMBED"]["context"]["client"]["clientVersion"] = "19.08.35"
# _default_clients["IOS_MUSIC"]["context"]["client"]["clientVersion"] = "6.41"
# _default_clients["ANDROID_MUSIC"] = _default_clients["ANDROID_CREATOR"]

killer = GracefulKiller()

API_TOKEN = getenv("API_TOKEN")
API_URL = getenv("API_URL", "https://api-inference.huggingface.co/models/ivanlau/language-detection-fine-tuned-on-xlm-roberta-base")
headers = {"Authorization": f"Bearer {API_TOKEN}"}
SKIP_LICENSE_CHECK = getenv("SKIP_LICENSE_CHECK", 'False').lower() in ('true', '1', 't')

youtube_wait = 5

Expand Down Expand Up @@ -94,8 +104,11 @@ def youtube_license_check(yt):
try:
yt = get_youtube(source_id, url)
new_status = "ready_for_download" if youtube_language_check(yt) else "bad_language"
if new_status == "bad_language":
print(f"Bad language: {url}")
license = "CC-BY" if youtube_license_check(yt) else "PROP"
if license == "PROP":
if license == "PROP" and not SKIP_LICENSE_CHECK:
print(f"Bad licence (not CC-BY): {url} ")
new_status = "bad_license"
captions = 'ca' in yt.captions
cur.execute(f"UPDATE sources SET status='{new_status}', license='{license}', has_captions='{captions}', status_update=now() WHERE source_id = '{source_id}'")
Expand Down

0 comments on commit 54322e3

Please sign in to comment.