-
Notifications
You must be signed in to change notification settings - Fork 0
/
categorize_manually.py
64 lines (50 loc) · 2.03 KB
/
categorize_manually.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import json
import re

import requests
from pytube import Channel
# Enter categories here. Each category name maps to the key that stands for it
# in the CSV; print_categories() lists these pairs when asking for a key.
categories = {
    'night': 1,
    'dawn': 2,
    'day': 3,
}
# Allows you to filter all videos before going through them manually.
def prefilter_video(video, *, min_seconds=60 * 10, max_seconds=60 * 120):
    """Return True when the video's length lies strictly between the bounds.

    Args:
        video: Object exposing a numeric ``length`` attribute. The script
            prints ``length / 60`` as minutes, so it is treated as seconds.
        min_seconds: Exclusive lower bound; defaults to 10 minutes.
        max_seconds: Exclusive upper bound; defaults to 120 minutes.

    Returns:
        bool: True if ``min_seconds < video.length < max_seconds``.
    """
    return min_seconds < video.length < max_seconds
def print_categories():
    """Print one tab-indented line per category: its name and its in-file key."""
    for name in categories:
        print(f'\t ({name}) --> {categories[name]}')
# Intermediate function to get the description of a video until PyTube fixes their bug:
# https://github.com/pytube/pytube/issues/1626
def get_video_description(youtube_video_url: str) -> str:
    """Fetch a YouTube watch page and return the video's description.

    The description is taken from the ``shortDescription`` JSON string that is
    embedded in the page HTML. The string is decoded as JSON, so escaped
    quotes, escaped backslashes, newlines and unicode escapes are returned as
    the characters they stand for rather than as raw escape sequences.

    Args:
        youtube_video_url: URL of the video's watch page.

    Returns:
        The decoded description, or an empty string when the page has no
        ``shortDescription`` field or its value is not a well-formed JSON
        string (for example when the page is truncated).

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    # A timeout keeps one stalled request from blocking the whole session.
    full_html = requests.get(youtube_video_url, timeout=30).text

    marker = re.search(r'shortDescription":"', full_html)
    if marker is None:
        # The field is missing, so there is nothing to show for this video.
        return ""

    # The pattern ends on the opening quote of the JSON string, which is where
    # the decoder has to start. strict=False tolerates raw control characters.
    opening_quote = marker.end() - 1
    try:
        description, _ = json.JSONDecoder(strict=False).raw_decode(
            full_html, opening_quote
        )
    except json.JSONDecodeError:
        # Unterminated or malformed string: the description is only shown as a
        # labelling aid, so fall back to an empty one instead of aborting.
        return ""
    return description
# Interactive session: list the channel's videos, keep those that pass the
# prefilter, show each one and record the key the user assigns to it.
channel_url = input('Enter channel URL: ')
channel = Channel(channel_url)
videos = list(channel.videos)
filtered = [video for video in videos if prefilter_video(video)]
total = len(filtered)

# The in-file keys exactly as the user would type them. Anything else is
# rejected so that a typo never ends up in the CSV.
valid_keys = {str(in_file_key) for in_file_key in categories.values()}

with open('output.csv', "w+") as output_file:
    for position, video in enumerate(filtered, start=1):
        video_description = get_video_description(video.watch_url)
        print()
        print(f'### video {position} of {total} ###')
        print(f'{video.title} ({video.length/60} minutes)')
        print()
        print(video_description)
        print()
        print('Which category does this video belong to:')
        print_categories()
        key = input('enter key or enter i to ignore: ').strip()
        # Ask again until the answer is "ignore" or one of the known keys.
        while key != 'i' and key not in valid_keys:
            print(f'"{key}" is not a valid key.')
            print_categories()
            key = input('enter key or enter i to ignore: ').strip()
        if key != 'i':
            output_file.write(f'{video.watch_url},{key}\n')
            # Flush per row so finished work survives an abrupt termination.
            output_file.flush()