From 0ff6feea6efb40629b49ca4364b373f8f7a2a5ed Mon Sep 17 00:00:00 2001
From: Richard Amodia <richard.amodia@mrpau.com>
Date: Wed, 20 May 2020 23:22:33 +0800
Subject: [PATCH 1/2] Cache skip videos to json file.

---
 arvind.py    |  2 +-
 sushichef.py | 58 ++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/arvind.py b/arvind.py
index efbee0e..0fa88ee 100644
--- a/arvind.py
+++ b/arvind.py
@@ -100,6 +100,7 @@ def download_info(self):
             print('==> URL ' + self.url + ' does not match YOUTUBE_ID_REGEX')
             return False
         youtube_id = match.group('youtube_id')
+        self.uid = youtube_id  # video must have id because required to set youtube_id later
         if not os.path.isdir(YOUTUBE_CACHE_DIR):
             os.mkdir(YOUTUBE_CACHE_DIR)
         vinfo_json_path = os.path.join(YOUTUBE_CACHE_DIR, youtube_id+'.json')
@@ -131,7 +132,6 @@ def download_info(self):
             else:
                 return False
 
-        self.uid = vinfo['id']  # video must have id because required to set youtube_id later
         self.title = vinfo.get('title', '')
         self.description = vinfo.get('description', '')
         if not vinfo['license']:
diff --git a/sushichef.py b/sushichef.py
index f8772f6..66ce828 100755
--- a/sushichef.py
+++ b/sushichef.py
@@ -5,8 +5,9 @@
 import requests
 import re
 import shutil
+import json
 
-from arvind import ArvindVideo, ArvindLanguage, YOUTUBE_CACHE_DIR
+from arvind import ArvindVideo, ArvindLanguage, YOUTUBE_CACHE_DIR, YOUTUBE_ID_REGEX
 
 from bs4 import BeautifulSoup
 
@@ -28,6 +29,8 @@
 
 SKIP_VIDEOS_PATH = os.path.join(ROOT_DIR_PATH, "skip_videos.txt")
 
+CACHE_VIDEOS_PATH = os.path.join(ROOT_DIR_PATH, "cache_skip_videos.json")
+
 # These are the languages that has no sub topics on its videos.
 SINGLE_TOPIC_LANGUAGES = [
     "bhojpuri", "nepali", "malayalam", "telugu", "bengali", \
@@ -38,6 +41,8 @@
 # List of multiple languages on its topics
 MULTI_LANGUAGE_TOPIC = ["russian", "french",]
 
+CACHE_VIDEO_LIST = []
+
 # This are the estimate total count of arvind gupta toys language contents
 TOTAL_ARVIND_LANG = 23
 
@@ -101,27 +106,58 @@ def save_skip_videos(video, topic, lang_obj):
     text_file.close()
 
 
+def load_skip_videos():
+    """Return the cached list of skipped YouTube ids ([] if missing or invalid)."""
+    if not os.path.exists(CACHE_VIDEOS_PATH):
+        return []
+    with open(CACHE_VIDEOS_PATH) as json_file:
+        try:
+            data = json.load(json_file)
+        except (OSError, ValueError):  # unreadable file or malformed JSON
+            print("Failed to load cache video list")
+            return []
+    # Valid JSON that is not a list is not a usable cache; callers expect a list.
+    return data if isinstance(data, list) else []
+
+
+def cache_skip_videos():
+    """Write this run's skipped ids plus the previously cached ones to disk."""
+    previously_skipped = load_skip_videos()
+    with open(CACHE_VIDEOS_PATH, 'w') as outfile:
+        merged = CACHE_VIDEO_LIST + previously_skipped
+        json.dump(merged, outfile)
+
+
 def download_video_topics(data, topic, topic_node, lang_obj):
     """
     Scrape, collect, and download the videos and their thumbnails.
     """
+    global CACHE_VIDEO_LIST
     pp = pprint.PrettyPrinter()
-    topic_limit = 0
+
     for vinfo in data[topic]:
         try:
+            video_url = vinfo['video_url']
             video = ArvindVideo(
-                url=vinfo['video_url'], 
+                url=video_url, 
                 title=vinfo['video_title'], 
                 language=lang_obj.code)
 
-            if video.download_info():
-
-                if video.license_common:
-                    include_video_topic(topic_node, video, lang_obj)
+            match = YOUTUBE_ID_REGEX.match(video_url)
+            if match:  # URLs without a recognisable YouTube id are ignored
+                youtube_id = match.group('youtube_id')
+                skip_videos = load_skip_videos()
+                if youtube_id in skip_videos:
+                    continue  # already cached as skipped; keep processing the topic
+                if video.download_info():
+                    if video.license_common:
+                        include_video_topic(topic_node, video, lang_obj)
+                    else:
+                        save_skip_videos(video, topic, lang_obj)
+                        CACHE_VIDEO_LIST.append(video.uid)
                 else:
                     save_skip_videos(video, topic, lang_obj)
-            else:
-                save_skip_videos(video, topic, lang_obj)
+                    CACHE_VIDEO_LIST.append(video.uid)
 
         except Exception as e:
             print('Error downloading this video:', e)
@@ -282,7 +318,7 @@ def create_language_topic():
             print("===> error getting language topics: ", e)
         language_next_int += 4
         loop_couter += 1
-
+    cache_skip_videos()
     # pp.pprint(data_contents)
     return main_topic_list
 
@@ -328,6 +364,6 @@ def construct_channel(self, **kwargs):
     Run this script on the command line using:
         python sushichef.py -v --reset --token=YOURTOKENHERE9139139f3a23232
     """
-
+    
     chef = ArvindChef()
     chef.main()

From 7468cb7de1355879c2fc42f86987d09652104cb2 Mon Sep 17 00:00:00 2001
From: Richard Amodia <richard.amodia@mrpau.com>
Date: Mon, 25 May 2020 22:16:40 +0800
Subject: [PATCH 2/2] Forgot to uncomment temporary comment.

---
 sushichef.py | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/sushichef.py b/sushichef.py
index 4768fe6..0507dd1 100755
--- a/sushichef.py
+++ b/sushichef.py
@@ -66,6 +66,7 @@
 
 YOUTUBE_DOMAINS = ["youtu.be", "youtube.com"]
 
+# Pass this flag on the command line to delete the skip-videos cache file (CACHE_SKIP_VIDEOS_PATH)
 CLEAR_SKIP_CACHE = "--clear-skip-cache"
 
 DEBUG_MODE = True  # Print extra debug info durig the chef run (disable in prod)
@@ -324,19 +325,19 @@ def create_language_topic():
                 # print('len(data_contents[lang_name])', len(data_contents[lang_name]))
                 language_topic = TopicNode(title=lang_name.capitalize(), source_id=language_source_id)
 
-                # if lang_name_lower not in SINGLE_TOPIC_LANGUAGES and lang_name_lower not in MULTI_LANGUAGE_TOPIC:
-                #     print("=======> This Language is in standard format", lang_name)
-                #     topic_type = STANDARD_TOPIC
-                #     generate_child_topics(data_contents, language_topic, lang_obj, topic_type)
-                #     main_topic_list.append(language_topic)
-                #     print("=====>finished", lang_name)
-
-                # if lang_name_lower in SINGLE_TOPIC_LANGUAGES:
-                #     print("=====> This Language is in single topic format ", lang_name)
-                #     topic_type = SINGLE_TOPIC
-                #     generate_child_topics(data_contents, language_topic, lang_obj, topic_type)
-                #     main_topic_list.append(language_topic)
-                #     print("=====>finished", lang_name)
+                if lang_name_lower not in SINGLE_TOPIC_LANGUAGES and lang_name_lower not in MULTI_LANGUAGE_TOPIC:
+                    print("=======> This Language is in standard format", lang_name)
+                    topic_type = STANDARD_TOPIC
+                    generate_child_topics(data_contents, language_topic, lang_obj, topic_type)
+                    main_topic_list.append(language_topic)
+                    print("=====>finished", lang_name)
+
+                if lang_name_lower in SINGLE_TOPIC_LANGUAGES:
+                    print("=====> This Language is in single topic format ", lang_name)
+                    topic_type = SINGLE_TOPIC
+                    generate_child_topics(data_contents, language_topic, lang_obj, topic_type)
+                    main_topic_list.append(language_topic)
+                    print("=====>finished", lang_name)
 
                 if lang_name_lower in MULTI_LANGUAGE_TOPIC:
                     print("=====> This Language is in multiple langauage topic format ", lang_name)