From 9f1ede6e6d860dc7f749f6adb86bbd256eb2f168 Mon Sep 17 00:00:00 2001
From: Shak Ragoler <58454820+Shak2000@users.noreply.github.com>
Date: Sat, 5 Mar 2022 20:24:02 -0800
Subject: [PATCH] feature/downloading-youtube-videos (#169)

* Fixing documentation
* Adding thumbnail compression
* Fixing a small issue related to imports
* Removing duplicate code from the thumbnail compression methods and replacing the large video file
* Moving imports
* make it possible to download youtube videos
* formatting the youtube downloading functions and tests
* adding the conftest
* fixing comments
* fixing more comments
* fixing some more comments
---
 cdp_backend/tests/conftest.py              |  3 ++
 cdp_backend/tests/utils/test_file_utils.py | 34 +++++++++++++++++
 cdp_backend/utils/file_utils.py            | 43 +++++++++++++++++++++-
 setup.py                                   |  1 +
 4 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/cdp_backend/tests/conftest.py b/cdp_backend/tests/conftest.py
index 0fb0825d..08faa358 100644
--- a/cdp_backend/tests/conftest.py
+++ b/cdp_backend/tests/conftest.py
@@ -29,6 +29,9 @@ def resources_dir() -> Path:
 EXAMPLE_VIDEO_FILENAME = "example_video.mp4"
 EXAMPLE_MKV_VIDEO_FILENAME = "example_video.mkv"
 EXAMPLE_VIDEO_HD_FILENAME = "example_video_large.mp4"
+EXAMPLE_YOUTUBE_VIDEO_EMBEDDED = "https://www.youtube.com/embed/XALBGkjkUPQ"
+EXAMPLE_YOUTUBE_VIDEO_PARAMETER = "https://www.youtube.com/watch?v=XALBGkjkUPQ"
+EXAMPLE_YOUTUBE_VIDEO_SHORT = "https://youtu.be/watch?v=XALBGkjkUPQ"
 
 
 @pytest.fixture
diff --git a/cdp_backend/tests/utils/test_file_utils.py b/cdp_backend/tests/utils/test_file_utils.py
index 8fbee5d4..6721a2e8 100644
--- a/cdp_backend/tests/utils/test_file_utils.py
+++ b/cdp_backend/tests/utils/test_file_utils.py
@@ -19,10 +19,14 @@
     resource_copy,
 )
 
+from .. import test_utils
 from ..conftest import (
     EXAMPLE_MKV_VIDEO_FILENAME,
     EXAMPLE_VIDEO_FILENAME,
     EXAMPLE_VIDEO_HD_FILENAME,
+    EXAMPLE_YOUTUBE_VIDEO_EMBEDDED,
+    EXAMPLE_YOUTUBE_VIDEO_PARAMETER,
+    EXAMPLE_YOUTUBE_VIDEO_SHORT,
 )
 
 #############################################################################
@@ -211,6 +215,10 @@ def test_hover_thumbnail_generator(
     os.remove(result)
 
 
+@pytest.mark.skipif(
+    not test_utils.internet_is_available(),
+    reason="No internet connection",
+)
 @pytest.mark.parametrize(
     "video_uri, expected",
     [
@@ -224,3 +232,29 @@ def test_convert_video_to_mp4(
 ) -> None:
     filepath = str(resources_dir / video_uri)
     assert file_utils.convert_video_to_mp4(filepath) == str(resources_dir / expected)
+
+
+@pytest.mark.skipif(
+    not test_utils.internet_is_available(),
+    reason="No internet connection",
+)
+@pytest.mark.parametrize(
+    "youtube_uri, expected",
+    [
+        (EXAMPLE_YOUTUBE_VIDEO_EMBEDDED, "XALBGkjkUPQ.mp4"),
+        (EXAMPLE_YOUTUBE_VIDEO_PARAMETER, "XALBGkjkUPQ.mp4"),
+        (EXAMPLE_YOUTUBE_VIDEO_SHORT, "XALBGkjkUPQ.mp4"),
+    ],
+)
+def test_youtube_downloader(
+    resources_dir: Path,
+    youtube_uri: str,
+    expected: str,
+) -> None:
+    actual_uri = file_utils.resource_copy(youtube_uri, resources_dir, True)
+    expected_uri = str(resources_dir / expected)
+    assert actual_uri == expected_uri
+    assert Path(actual_uri).exists()
+    assert Path(actual_uri).is_file()
+
+    os.remove(actual_uri)
diff --git a/cdp_backend/utils/file_utils.py b/cdp_backend/utils/file_utils.py
index 3c2e44cc..1f106380 100644
--- a/cdp_backend/utils/file_utils.py
+++ b/cdp_backend/utils/file_utils.py
@@ -11,6 +11,7 @@
 import aiohttp
 import fsspec
 from fsspec.core import url_to_fs
+from yt_dlp import YoutubeDL
 
 ###############################################################################
 
@@ -90,7 +91,12 @@ def resource_copy(
     # Ensure dst doesn't exist
     dst = Path(dst).resolve()
     if dst.is_dir():
-        dst = dst / uri.split("/")[-1]
+        if "v=" in str(uri):
+            # Split by youtube video query parameter
+            dst = dst / uri.split("v=")[-1]
+        else:
+            # Split by the last "/"
+            dst = dst / uri.split("/")[-1]
 
     # Ensure filename is less than 255 chars
     # Otherwise this can raise an OSError for too long of a filename
@@ -105,6 +111,9 @@ def resource_copy(
     log.info(f"Beginning resource copy from: {uri}")
     # Get file system
     try:
+        if "youtube.com" in uri or "youtu.be" in uri:
+            return youtube_copy(uri, dst, overwrite)
+
         kwargs = {}
 
         # Set custom timeout for http resources
@@ -126,6 +135,38 @@ def resource_copy(
         raise e
 
 
+def youtube_copy(uri: str, dst: Path, overwrite: bool = False) -> str:
+    """
+    Download a YouTube video to a local ``.mp4`` file using yt-dlp.
+
+    Parameters
+    ----------
+    uri: str
+        The url of the YouTube video to copy.
+    dst: Path
+        Where to store the video; its suffix is always replaced with ".mp4".
+    overwrite: bool
+        Whether to replace an existing file at the destination. When False and
+        the file already exists, a FileExistsError is raised. Default: False
+
+    Returns
+    -------
+    dst: str
+        The location of the downloaded file.
+    """
+    # The download requests the "mp4" format, so make the file name agree.
+    dst = dst.with_suffix(".mp4")
+
+    # Refuse to clobber an existing file unless the caller asked for it
+    if dst.is_file() and not overwrite:
+        raise FileExistsError(dst)
+
+    ydl_opts = {"outtmpl": str(dst), "format": "mp4", "overwrites": overwrite}
+    with YoutubeDL(ydl_opts) as ydl:
+        ydl.download([uri])
+    return str(dst)
+
+
 def split_audio(
     video_read_path: str,
     audio_save_path: str,
diff --git a/setup.py b/setup.py
index 8fe959f2..5a29d5be 100644
--- a/setup.py
+++ b/setup.py
@@ -32,6 +32,7 @@
     "spacy~=3.0",
     "truecase~=0.0.12",
     "webvtt-py~=0.4.6",
+    "yt-dlp~=2022.2.4",
 ]
 
 test_requirements = [