Cache skip videos to json file. #9

Open · wants to merge 3 commits into base: master
arvind.py (2 changes: 1 addition & 1 deletion)

@@ -100,6 +100,7 @@ def download_info(self):
             print('==> URL ' + self.url + ' does not match YOUTUBE_ID_REGEX')
             return False
         youtube_id = match.group('youtube_id')
+        self.uid = youtube_id # video must have id because required to set youtube_id later
         if not os.path.isdir(YOUTUBE_CACHE_DIR):
             os.mkdir(YOUTUBE_CACHE_DIR)
         vinfo_json_path = os.path.join(YOUTUBE_CACHE_DIR, youtube_id+'.json')
@@ -135,7 +136,6 @@ def download_info(self):
         else:
             return False
 
-        self.uid = vinfo['id'] # video must have id because required to set youtube_id later
         self.title = vinfo.get('title', '')
         self.description = vinfo.get('description', '')
         if not vinfo['license']:
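The net effect of the arvind.py change is that self.uid is now derived straight from the URL, before any cached or remote metadata is read, so the id is still set on the failure paths of download_info() that return False. Below is a minimal runnable sketch of that idea; the regex, the VideoStub class, and the example URL are hypothetical stand-ins for this sketch, not code from the repository.

import re

# Hypothetical stand-in for arvind.YOUTUBE_ID_REGEX; the real pattern is defined
# in arvind.py and is not part of this diff.
YOUTUBE_ID_REGEX = re.compile(
    r'https?://(www\.)?youtube\.com/watch\?v=(?P<youtube_id>[\w-]{11})')


class VideoStub:
    """Toy stand-in for ArvindVideo, just enough to show the uid-before-fetch idea."""

    def __init__(self, url):
        self.url = url
        self.uid = None

    def download_info(self):
        match = YOUTUBE_ID_REGEX.match(self.url)
        if not match:
            print('==> URL ' + self.url + ' does not match YOUTUBE_ID_REGEX')
            return False
        # Mirrors the moved line in the PR: set the id before any cache/network
        # work, so callers can still record this video in the skip cache even if
        # the metadata lookup below fails.
        self.uid = match.group('youtube_id')
        metadata_ok = False  # pretend the cached/remote vinfo lookup failed
        return metadata_ok


video = VideoStub('https://www.youtube.com/watch?v=dQw4w9WgXcQ')
print(video.download_info(), video.uid)  # False dQw4w9WgXcQ: uid survives the failure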
sushichef.py (77 changes: 63 additions & 14 deletions)

@@ -1,11 +1,13 @@
 #!/usr/bin/env python
 
+import json
 import os
 import requests
 import re
 import shutil
+import sys
 
-from arvind import ArvindVideo, ArvindLanguage, YOUTUBE_CACHE_DIR
+from arvind import ArvindVideo, ArvindLanguage, YOUTUBE_CACHE_DIR, YOUTUBE_ID_REGEX
 
 from bs4 import BeautifulSoup
 from bs4.element import NavigableString
@@ -27,6 +29,8 @@
 
 SKIP_VIDEOS_PATH = os.path.join(ROOT_DIR_PATH, "skip_videos.txt")
 
+CACHE_SKIP_VIDEOS_PATH = os.path.join(ROOT_DIR_PATH, "cache_skip_videos.json")
+
 # These are the languages that has no sub topics on its videos.
 SINGLE_TOPIC_LANGUAGES = [
     "bhojpuri; bajpuri; bhojapuri", # actual lang_obj.name in le-utils
@@ -51,6 +55,8 @@
 # List of multiple languages on its topics
 MULTI_LANGUAGE_TOPIC = ["russian", "french",]
 
+CACHE_SKIP_VIDEO_LIST = []
+
 # This are the estimate total count of arvind gupta toys language contents
 TOTAL_ARVIND_LANG = 23
 
@@ -60,6 +66,8 @@
 
 YOUTUBE_DOMAINS = ["youtu.be", "youtube.com"]
 
+# Include this at script argv to delete CACHE_SKIP_VIDEOS_PATH
+CLEAR_SKIP_CACHE = "--clear-skip-cache"
 
 DEBUG_MODE = True # Print extra debug info durig the chef run (disable in prod)
 
@@ -89,6 +97,28 @@ def clean_video_title(title, lang_obj):
     return clean_title
 
 
+def load_skip_videos():
+    data = []
+    if not os.path.exists(CACHE_SKIP_VIDEOS_PATH):
+        return data
+    with open(CACHE_SKIP_VIDEOS_PATH) as json_file:
+        try:
+            data = json.load(json_file)
+            if type(data) == type([]) :
+                return data
+        except Exception as e:
+            print("Error failed to load cache video", e)
+    return data
+
+
+def cache_skip_videos():
+    global CACHE_SKIP_VIDEO_LIST
+    skip_videos = load_skip_videos()
+    with open(CACHE_SKIP_VIDEOS_PATH, 'w') as outfile:
+        data = CACHE_SKIP_VIDEO_LIST + skip_videos
+        json.dump(data, outfile)
+
+
 def include_video_topic(topic_node, video_data, lang_obj):
     # Include video details to the parent topic node
     video_id = video_data.uid
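For orientation, here is a small runnable toy version of the round trip that load_skip_videos() and cache_skip_videos() implement: the cache is a flat JSON list of video ids, and each write merges the ids collected during the current run with whatever a previous run left on disk. The path, the id string, and the isinstance check are stand-ins chosen for this sketch, not the exact code above.

import json
import os
import tempfile

# Hypothetical stand-ins for the module-level names used in the diff above.
CACHE_SKIP_VIDEOS_PATH = os.path.join(tempfile.gettempdir(), "cache_skip_videos.json")
CACHE_SKIP_VIDEO_LIST = ["id-from-this-run"]  # uids appended while the chef runs


def load_skip_videos():
    """Return the previously cached list, or [] if the file is missing or unreadable."""
    if not os.path.exists(CACHE_SKIP_VIDEOS_PATH):
        return []
    try:
        with open(CACHE_SKIP_VIDEOS_PATH) as json_file:
            data = json.load(json_file)
            return data if isinstance(data, list) else []
    except Exception as e:
        print("Error failed to load cache video", e)
        return []


def cache_skip_videos():
    """Merge this run's skips with the cached ones and rewrite the JSON file."""
    merged = CACHE_SKIP_VIDEO_LIST + load_skip_videos()
    with open(CACHE_SKIP_VIDEOS_PATH, "w") as outfile:
        json.dump(merged, outfile)


cache_skip_videos()        # writes ["id-from-this-run"] on a fresh cache
print(load_skip_videos())  # ['id-from-this-run']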
@@ -124,26 +154,35 @@ def download_video_topics(data, topic, topic_node, lang_obj):
     """
     Scrape, collect, and download the videos and their thumbnails.
     """
+    global CACHE_SKIP_VIDEO_LIST
     video_source_ids = []
     for vinfo in data[topic]:
         try:
+            video_url = vinfo['video_url']
             video = ArvindVideo(
-                url=vinfo['video_url'],
+                url=video_url,
                 title=vinfo['video_title'],
                 language=lang_obj.code)
 
-            if video.download_info():
-                if video.license_common:
-                    video_source_id = 'arvind-video-{0}'.format(video.uid)
-                    if video_source_id not in video_source_ids:
-                        include_video_topic(topic_node, video, lang_obj)
-                        video_source_ids.append(video_source_id)
+            match = YOUTUBE_ID_REGEX.match(video_url)
+            if match:
+                youtube_id = match.group('youtube_id')
+                skip_videos = load_skip_videos()
+                if not youtube_id in skip_videos:
+                    if video.download_info():
+                        if video.license_common:
+                            video_source_id = 'arvind-video-{0}'.format(video.uid)
+                            if video_source_id not in video_source_ids:
+                                include_video_topic(topic_node, video, lang_obj)
+                                video_source_ids.append(video_source_id)
+                            else:
+                                print('Skipping duplicate video: ' + str(vinfo['video_url']))
+                        else:
+                            save_skip_videos(video, topic, lang_obj)
+                            CACHE_SKIP_VIDEO_LIST.append(video.uid)
                     else:
-                        print('Skipping duplicate video: ' + str(vinfo['video_url']))
-                else:
-                    save_skip_videos(video, topic, lang_obj)
-            else:
-                save_skip_videos(video, topic, lang_obj)
+                        save_skip_videos(video, topic, lang_obj)
+                        CACHE_SKIP_VIDEO_LIST.append(video.uid)
 
         except Exception as e:
             print('Error downloading this video:', e)
@@ -321,6 +360,7 @@ def create_language_topic():
         language_next_int += 4
         loop_couter += 1
 
+    cache_skip_videos()
    return main_topic_list
 
 
@@ -357,11 +397,20 @@ def construct_channel(self, **kwargs):
         return channel
 
 
+def check_arg():
+    """Remove cache skip videos if CLEAR_SKIP_CACHE in argv"""
+    args_val = sys.argv
+    if CLEAR_SKIP_CACHE in args_val:
+        args_val.remove(CLEAR_SKIP_CACHE)
+        if os.path.exists(CACHE_SKIP_VIDEOS_PATH):
+            os.remove(CACHE_SKIP_VIDEOS_PATH)
+
+
 if __name__ == "__main__":
     """
     Run this script on the command line using:
     python sushichef.py -v --reset --token=YOURTOKENHERE9139139f3a23232
     """
-
+    check_arg()
     chef = ArvindChef()
     chef.main()
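Finally, the new --clear-skip-cache flag is handled before the chef's own command-line parsing: check_arg() removes it from sys.argv and deletes cache_skip_videos.json if present, so the next run re-checks every video. A minimal runnable sketch of that argv handling follows; the argv list is hypothetical, and the token value is the same placeholder used in the docstring above.

CLEAR_SKIP_CACHE = "--clear-skip-cache"

# Hypothetical invocation:
#   python sushichef.py -v --reset --token=YOURTOKENHERE9139139f3a23232 --clear-skip-cache
argv = ["sushichef.py", "-v", "--reset",
        "--token=YOURTOKENHERE9139139f3a23232", CLEAR_SKIP_CACHE]

if CLEAR_SKIP_CACHE in argv:
    argv.remove(CLEAR_SKIP_CACHE)  # hide the custom flag from the chef's own CLI parser
    # ...at this point the real check_arg() also deletes cache_skip_videos.json if it exists.

print(argv)  # ['sushichef.py', '-v', '--reset', '--token=YOURTOKENHERE9139139f3a23232']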