-
Notifications
You must be signed in to change notification settings - Fork 3
/
arvind.py
148 lines (121 loc) · 4.76 KB
/
arvind.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import json
import os
import re
import youtube_dl
from pressurecooker.youtube import YouTubeResource
from le_utils.constants.languages import getlang_by_name
# Directory (relative to the working directory) holding one `<youtube_id>.json`
# file of cached video metadata per video; see ArvindVideo.download_info().
YOUTUBE_CACHE_DIR = os.path.join('chefdata', 'youtubecache')
# Matches YouTube-style URLs (optional scheme and `www.`, youtube / youtu /
# youtube-nocookie hosts on .com or .be, optional `watch?v=`, `embed/`, `v/`
# or `<anything>?v=` path prefix) and captures the 11-character video id in
# the named group `youtube_id`.
YOUTUBE_ID_REGEX = re.compile(r'(https?://)?(www\.)?(youtube|youtu|youtube-nocookie)\.(com|be)/(watch\?v=|embed/|v/|.+\?v=)?(?P<youtube_id>[A-Za-z0-9\-=_]{11})')
# Languages that are not available in le_utils.
# Keys are the lowercased language names used for lookup by
# ArvindLanguage.get_lang_obj(); each value carries the `name`, `code` and
# `native_name` to use for that language instead.
UND_LANG = {
    "marwari": {
        "name": "Marwari",
        "native_name": "marwari",
        "code": "und",  # temporary while le-utils updated in Studio
    },
    "bhojpuri": {
        "name": "Bhojpuri",
        "native_name": "bhojpuri",
        "code": "und",  # temporary while le-utils updated in Studio
    },
    "odiya": {
        "name": "Odiya",
        "native_name": "odiya",
        "code": "or",
    },
    "sci_edu": {
        "name": "Science/Educational",
        "native_name": "hindi",
        "code": "hi",
    },
}
class ArvindLanguage():
    """Language of an Arvind resource, identified by its name.

    Holds the ``name``, ``code`` and ``native_name`` of a language.  The
    values given to the constructor are provisional; ``get_lang_obj()``
    replaces them with the values found in le_utils or, failing that, in the
    module-level ``UND_LANG`` fallback table.
    """

    name = ''
    code = ''
    native_name = ''

    def __init__(self, name='', code='', native_name=''):
        """Store the given values; ``name`` is lowercased for later lookups."""
        self.name = name.lower()
        self.code = code
        self.native_name = native_name

    def set_value(self, name, code, native_name):
        """Overwrite all three fields as given (``name`` is NOT lowercased)."""
        self.name = name
        self.code = code
        self.native_name = native_name

    def get_lang_obj(self):
        """Resolve ``self.name`` and update this object with the result.

        The name is looked up first with ``getlang_by_name`` and then, if
        that yields nothing, in ``UND_LANG``.

        Returns:
            True if the language was found (the fields are then overwritten
            with the resolved values), False if the name is empty or unknown
            to both sources.
        """
        if self.name == "":
            return False

        language_obj = getlang_by_name(self.name)
        if language_obj:
            self.set_value(language_obj.name, language_obj.code, language_obj.native_name)
            return True

        # Use .get() so a name missing from the fallback table yields False
        # instead of raising KeyError.
        fallback = UND_LANG.get(self.name)
        if fallback:
            self.set_value(fallback["name"], fallback["code"], fallback["native_name"])
            return True
        return False
class ArvindVideo():
    """A single YouTube video scraped from the Arvind site.

    The constructor stores provisional scraped values; ``download_info()``
    replaces ``uid``, ``title``, ``description`` and the license fields with
    the values from the video's metadata.
    """

    uid = 0  # value from `id` after `youtube_dl.extract_info()`
    title = ''
    description = ''
    url = ''
    language = ''
    thumbnail = ''  # local path to thumbnail image
    license = ''
    license_common = False

    def __init__(self, uid=0, url='', title='', description='', language='',):
        """Store the scraped values; ``uid`` is always kept as a string."""
        self.uid = str(uid)
        self.url = url
        self.title = title
        self.description = description
        self.thumbnail = None
        self.language = language
        self.license_common = False

    def __str__(self):
        return 'ArvindVideo (%s - %s - %s)' % (self.uid, self.url, self.title)

    def download_info(self):
        """Load this video's metadata from the JSON cache or from YouTube.

        The metadata is cached in ``YOUTUBE_CACHE_DIR/<youtube_id>.json``.
        A cached ``null`` marks a video already known to be unavailable.

        Returns:
            True if metadata was obtained and the attributes were updated,
            False if the URL is not a recognised YouTube URL, the video is
            unavailable, or fetching/caching the metadata failed.
        """
        match = YOUTUBE_ID_REGEX.match(self.url)
        if not match:
            print('==> URL ' + self.url + ' does not match YOUTUBE_ID_REGEX')
            return False
        youtube_id = match.group('youtube_id')

        # makedirs also creates the missing parent directory and tolerates an
        # already existing cache directory.
        os.makedirs(YOUTUBE_CACHE_DIR, exist_ok=True)
        vinfo_json_path = os.path.join(YOUTUBE_CACHE_DIR, youtube_id + '.json')

        # First try to get from cache:
        vinfo = None
        if os.path.exists(vinfo_json_path):
            try:
                with open(vinfo_json_path, encoding='utf-8') as cache_file:
                    vinfo = json.load(cache_file)
            except ValueError as e:
                # Corrupt/truncated cache file (e.g. an interrupted write):
                # treat it as a cache miss and download again below.
                print("Ignoring unreadable cache file {}: {}".format(vinfo_json_path, e))
                vinfo = None
            else:
                if not vinfo:
                    # the json data for "Video unavailable" is `null` so can skip them
                    return False
                print("Using cached video info for youtube_id", youtube_id)

        # else get using YouTubeResource
        if not vinfo:
            print("Downloading {} from youtube...".format(self.url))
            try:
                video = YouTubeResource(self.url)
            except youtube_dl.utils.ExtractorError as e:
                if "unavailable" in str(e):
                    print("Video not found at URL: {}".format(self.url))
                else:
                    print("Could not extract video at URL: {} ({})".format(self.url, e))
                # Always bail out here: `video` is unbound after a failure.
                return False

            if not video:
                return False

            try:
                vinfo = video.get_resource_info()
                # Save the metadata (or `null` for an unavailable video) so the
                # next run can answer from the cache.
                with open(vinfo_json_path, 'w', encoding='utf-8') as cache_file:
                    json.dump(vinfo, cache_file, indent=4, ensure_ascii=False, sort_keys=True)
            except Exception as e:
                # Best effort: any failure to fetch or cache means "no info".
                print(e)
                return False

            if not vinfo:
                # Nothing usable was returned; indexing it below would crash.
                return False

        self.uid = vinfo['id']  # video must have id because required to set youtube_id later
        self.title = vinfo.get('title', '')
        self.description = vinfo.get('description', '')

        video_license = vinfo.get('license')  # key may be missing entirely
        if not video_license:
            self.license = "Licensed not available"
        elif "Creative Commons" in video_license:
            self.license_common = True
        else:
            self.license = video_license
        return True