From 0ff6feea6efb40629b49ca4364b373f8f7a2a5ed Mon Sep 17 00:00:00 2001 From: Richard Amodia Date: Wed, 20 May 2020 23:22:33 +0800 Subject: [PATCH 1/2] Cache skip videos to json file. --- arvind.py | 2 +- sushichef.py | 58 ++++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 48 insertions(+), 12 deletions(-) diff --git a/arvind.py b/arvind.py index efbee0e..0fa88ee 100644 --- a/arvind.py +++ b/arvind.py @@ -100,6 +100,7 @@ def download_info(self): print('==> URL ' + self.url + ' does not match YOUTUBE_ID_REGEX') return False youtube_id = match.group('youtube_id') + self.uid = youtube_id # video must have id because required to set youtube_id later if not os.path.isdir(YOUTUBE_CACHE_DIR): os.mkdir(YOUTUBE_CACHE_DIR) vinfo_json_path = os.path.join(YOUTUBE_CACHE_DIR, youtube_id+'.json') @@ -131,7 +132,6 @@ def download_info(self): else: return False - self.uid = vinfo['id'] # video must have id because required to set youtube_id later self.title = vinfo.get('title', '') self.description = vinfo.get('description', '') if not vinfo['license']: diff --git a/sushichef.py b/sushichef.py index f8772f6..66ce828 100755 --- a/sushichef.py +++ b/sushichef.py @@ -5,8 +5,9 @@ import requests import re import shutil +import json -from arvind import ArvindVideo, ArvindLanguage, YOUTUBE_CACHE_DIR +from arvind import ArvindVideo, ArvindLanguage, YOUTUBE_CACHE_DIR, YOUTUBE_ID_REGEX from bs4 import BeautifulSoup @@ -28,6 +29,8 @@ SKIP_VIDEOS_PATH = os.path.join(ROOT_DIR_PATH, "skip_videos.txt") +CACHE_VIDEOS_PATH = os.path.join(ROOT_DIR_PATH, "cache_skip_videos.json") + # These are the languages that has no sub topics on its videos. 
SINGLE_TOPIC_LANGUAGES = [ "bhojpuri", "nepali", "malayalam", "telugu", "bengali", \ @@ -38,6 +41,8 @@ # List of multiple languages on its topics MULTI_LANGUAGE_TOPIC = ["russian", "french",] +CACHE_VIDEO_LIST = [] + # This are the estimate total count of arvind gupta toys language contents TOTAL_ARVIND_LANG = 23 @@ -101,27 +106,58 @@ def save_skip_videos(video, topic, lang_obj): text_file.close() +def load_skip_videos(): + data = [] + if not os.path.exists(CACHE_VIDEOS_PATH): + return data + with open(CACHE_VIDEOS_PATH) as json_file: + try: + loaded = json.load(json_file) + if isinstance(loaded, list): + return loaded + except (OSError, ValueError): + print("Failed to load cache video list") + return data + + +def cache_skip_videos(): + global CACHE_VIDEO_LIST + skip_videos = load_skip_videos() + with open(CACHE_VIDEOS_PATH, 'w') as outfile: + data = CACHE_VIDEO_LIST + skip_videos + json.dump(data, outfile) + + def download_video_topics(data, topic, topic_node, lang_obj): """ Scrape, collect, and download the videos and their thumbnails. 
""" + global CACHE_VIDEO_LIST pp = pprint.PrettyPrinter() - topic_limit = 0 + for vinfo in data[topic]: try: + video_url = vinfo['video_url'] video = ArvindVideo( - url=vinfo['video_url'], + url=video_url, title=vinfo['video_title'], language=lang_obj.code) - if video.download_info(): - - if video.license_common: - include_video_topic(topic_node, video, lang_obj) + match = YOUTUBE_ID_REGEX.match(video_url) + if match: + youtube_id = match.group('youtube_id') + skip_videos = load_skip_videos() + if youtube_id in skip_videos: + continue + if video.download_info(): + if video.license_common: + include_video_topic(topic_node, video, lang_obj) + else: + save_skip_videos(video, topic, lang_obj) + CACHE_VIDEO_LIST.append(video.uid) else: save_skip_videos(video, topic, lang_obj) - else: - save_skip_videos(video, topic, lang_obj) + CACHE_VIDEO_LIST.append(video.uid) except Exception as e: print('Error downloading this video:', e) @@ -282,7 +318,7 @@ def create_language_topic(): print("===> error getting language topics: ", e) language_next_int += 4 loop_couter += 1 - + cache_skip_videos() # pp.pprint(data_contents) return main_topic_list @@ -328,6 +364,6 @@ def construct_channel(self, **kwargs): Run this script on the command line using: python sushichef.py -v --reset --token=YOURTOKENHERE9139139f3a23232 """ - + chef = ArvindChef() chef.main() From 7468cb7de1355879c2fc42f86987d09652104cb2 Mon Sep 17 00:00:00 2001 From: Richard Amodia Date: Mon, 25 May 2020 22:16:40 +0800 Subject: [PATCH 2/2] Forgot to uncomment temporary comment. 
--- sushichef.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/sushichef.py b/sushichef.py index 4768fe6..0507dd1 100755 --- a/sushichef.py +++ b/sushichef.py @@ -66,6 +66,7 @@ YOUTUBE_DOMAINS = ["youtu.be", "youtube.com"] +# Include this at script argv to delete CACHE_SKIP_VIDEOS_PATH CLEAR_SKIP_CACHE = "--clear-skip-cache" DEBUG_MODE = True # Print extra debug info durig the chef run (disable in prod) @@ -324,19 +325,19 @@ def create_language_topic(): # print('len(data_contents[lang_name])', len(data_contents[lang_name])) language_topic = TopicNode(title=lang_name.capitalize(), source_id=language_source_id) - # if lang_name_lower not in SINGLE_TOPIC_LANGUAGES and lang_name_lower not in MULTI_LANGUAGE_TOPIC: - # print("=======> This Language is in standard format", lang_name) - # topic_type = STANDARD_TOPIC - # generate_child_topics(data_contents, language_topic, lang_obj, topic_type) - # main_topic_list.append(language_topic) - # print("=====>finished", lang_name) - - # if lang_name_lower in SINGLE_TOPIC_LANGUAGES: - # print("=====> This Language is in single topic format ", lang_name) - # topic_type = SINGLE_TOPIC - # generate_child_topics(data_contents, language_topic, lang_obj, topic_type) - # main_topic_list.append(language_topic) - # print("=====>finished", lang_name) + if lang_name_lower not in SINGLE_TOPIC_LANGUAGES and lang_name_lower not in MULTI_LANGUAGE_TOPIC: + print("=======> This Language is in standard format", lang_name) + topic_type = STANDARD_TOPIC + generate_child_topics(data_contents, language_topic, lang_obj, topic_type) + main_topic_list.append(language_topic) + print("=====>finished", lang_name) + + if lang_name_lower in SINGLE_TOPIC_LANGUAGES: + print("=====> This Language is in single topic format ", lang_name) + topic_type = SINGLE_TOPIC + generate_child_topics(data_contents, language_topic, lang_obj, topic_type) + main_topic_list.append(language_topic) + print("=====>finished", 
lang_name) if lang_name_lower in MULTI_LANGUAGE_TOPIC: print("=====> This Language is in multiple langauage topic format ", lang_name)