Be able to extract all videos from a YouTube channel with more than 20,000 videos #255
```python
import requests

CHANNEL_ID = 'UCf8w5m0YsRa8MHQ5bwSGmbw'
PLAYLIST_ID = 'UU' + CHANNEL_ID[2:]

URL = 'https://yt.lemnoslife.com/noKey/playlistItems'

params = {
    'part': ','.join(['snippet']),
    'playlistId': PLAYLIST_ID,
    'maxResults': 50,
    'fields': 'items/snippet/resourceId/videoId,nextPageToken',
}

videoIds = set()

# Follow `nextPageToken` until the API stops returning one.
while True:
    response = requests.get(URL, params = params).json()
    for item in response['items']:
        videoIds.add(item['snippet']['resourceId']['videoId'])
    if not 'nextPageToken' in response:
        break
    params['pageToken'] = response['nextPageToken']

print(len(videoIds)) # 19,997
```

Based on my Stack Overflow answer 74579030. Note that the OP mentions @NBA, but it only has 14,451 videos. @FRANCE24 only seems to have 5,873 videos. Neither Stack Overflow question mentions channels with more than 20,000 videos. @asianetnews seems to have more than 20,000 videos, as the YouTube UI shows.

Now that I have investigated and simplified:

```python
import requests
import blackboxprotobuf
import base64

CHANNEL_ID = 'UCf8w5m0YsRa8MHQ5bwSGmbw'
PLAYLIST_ID = 'UU' + CHANNEL_ID[2:]

def getBase64Protobuf(message, typedef):
    data = blackboxprotobuf.encode_message(message, typedef)
    return base64.b64encode(data).decode('ascii')

# Forge the `pageToken` protobuf for a given result offset instead of relying
# on the `nextPageToken` returned by the API.
def getPageToken(index):
    message = {
        '1': index,
    }
    typedef = {
        '1': {
            'type': 'int'
        },
    }
    three = getBase64Protobuf(message, typedef)
    message = {
        '2': 0,
        '3': f'PT:{three}'
    }
    typedef = {
        '2': {
            'type': 'int'
        },
        '3': {
            'type': 'string'
        }
    }
    pageToken = getBase64Protobuf(message, typedef)
    return pageToken

URL = 'https://yt.lemnoslife.com/noKey/playlistItems'
MAX_RESULTS = 50

params = {
    'part': ','.join(['snippet']),
    'playlistId': PLAYLIST_ID,
    'maxResults': MAX_RESULTS,
    'fields': 'items/snippet/resourceId/videoId',
}

videoIds = set()
requestIndex = 0

while True:
    response = requests.get(URL, params = params).json()
    for item in response['items']:
        videoIds.add(item['snippet']['resourceId']['videoId'])
    print(len(videoIds))
    requestIndex += 1
    params['pageToken'] = getPageToken(requestIndex * MAX_RESULTS)
```
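When debugging the forged tokens, it may also help to go the other way and inspect a `pageToken` or `nextPageToken` actually returned by the API. This is a minimal sketch of my own (not part of the original comment), assuming the token is URL-safe base64 that may lack padding:

```python
# Hypothetical debugging helper: decode an existing page token to inspect its
# protobuf fields. The padding handling is an assumption.
import base64
import blackboxprotobuf

def decodePageToken(pageToken):
    data = base64.urlsafe_b64decode(pageToken + '=' * (-len(pageToken) % 4))
    # Returns the decoded message and the guessed type definition.
    return blackboxprotobuf.decode_message(data)

# Example: print(decodePageToken(response['nextPageToken']))
```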
```python
import requests

CHANNEL_ID = 'UCf8w5m0YsRa8MHQ5bwSGmbw'
PLAYLIST_ID = 'UU' + CHANNEL_ID[2:]

URL = 'https://yt.lemnoslife.com/playlistItems'

params = {
    'part': ','.join(['snippet']),
    'playlistId': PLAYLIST_ID,
}

videoIds = set()

while True:
    response = requests.get(URL, params = params).json()
    for item in response['items']:
        videoIds.add(item['snippet']['resourceId']['videoId'])
    if not 'nextPageToken' in response:
        break
    params['pageToken'] = response['nextPageToken']

print(len(videoIds)) # 19,996
```

Let us check if I can bypass this limit with the low-level approach. Now that I have investigated and simplified, I used the following to debug the iteration algorithm:

```python
import requests
import json

CHANNEL_ID = 'UCf8w5m0YsRa8MHQ5bwSGmbw'
PLAYLIST_ID = 'UU' + CHANNEL_ID[2:]

URL = 'http://localhost/YouTube-operational-API/playlistItems'
MAX_RESULTS = 100

params = {
    'part': ','.join(['snippet']),
    'playlistId': PLAYLIST_ID,
    'pageToken': '4qmFsgIqEhpWTFVVV2VnMlBrYXRlNjlORmRCZXVSRlRBdxoMZWdkUVZEcERSMUU5',
}

response = requests.get(URL, params = params).json()
print(response)
```

```diff
diff --git a/playlistItems.php b/playlistItems.php
index d35d5ed..7e19af8 100644
--- a/playlistItems.php
+++ b/playlistItems.php
@@ -67,7 +67,7 @@ function getAPI($playlistId, $continuationToken)
$result = json_decode($res, true);
$answerItems = [];
- $items = $continuationTokenProvided ? getContinuationItems($result) : getTabs($result)[0]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['playlistVideoListRenderer']['contents'];
+ $items = $continuationTokenProvided ? $result['continuationContents']['playlistVideoListContinuation']['contents'] : getTabs($result)[0]['tabRenderer']['content']['sectionListRenderer']['contents'][0]['itemSectionRenderer']['contents'][0]['playlistVideoListRenderer']['contents'];
$itemsCount = count($items);
for ($itemsIndex = 0; $itemsIndex < $itemsCount - 1; $itemsIndex++) {
$item = $items[$itemsIndex];
```

```python
import requests
import blackboxprotobuf
import base64

CHANNEL_ID = 'UCf8w5m0YsRa8MHQ5bwSGmbw'
PLAYLIST_ID = 'UU' + CHANNEL_ID[2:]

def getBase64Protobuf(message, typedef):
    data = blackboxprotobuf.encode_message(message, typedef)
    return base64.b64encode(data).decode('ascii')

def getPageToken(index):
    message = {
        '1': index,
    }
    typedef = {
        '1': {
            'type': 'int'
        },
    }
    fifteen = getBase64Protobuf(message, typedef)
    message = {
        '15': f'PT:{fifteen}'
    }
    typedef = {
        '15': {
            'type': 'string'
        }
    }
    three = getBase64Protobuf(message, typedef)
    message = {
        '80226972': {
            '2': f'VL{PLAYLIST_ID}',
            '3': three,
        }
    }
    typedef = {
        '80226972': {
            'type': 'message',
            'message_typedef': {
                '2': {
                    'type': 'string'
                },
                '3': {
                    'type': 'string'
                },
            },
            'field_order': [
                '2',
                '3',
            ]
        }
    }
    continuation = getBase64Protobuf(message, typedef)
    return continuation

URL = 'http://localhost/YouTube-operational-API/playlistItems'
MAX_RESULTS = 100

params = {
    'part': ','.join(['snippet']),
    'playlistId': PLAYLIST_ID,
}

videoIds = set()
requestIndex = 0

while True:
    response = requests.get(URL, params = params).json()
    for item in response['items']:
        videoIds.add(item['snippet']['resourceId']['videoId'])
    print(len(videoIds))
    requestIndex += 1
    params['pageToken'] = getPageToken(requestIndex * MAX_RESULTS)
```
The last response being:
```sh
$ yt-dlp --dump-json "https://www.youtube.com/channel/UCf8w5m0YsRa8MHQ5bwSGmbw/videos" -i | jq -r '[.id]|@csv' | wc -l
```
I guess that it loops forever when it reaches the 20,000 limit.
https://www.youtube.com/feeds/videos.xml?channel_id=UCWeg2Pkate69NFdBeuRFTAw lists only the first 15 entries, and the pagination does not seem clear, if there is any. I got the link from the Stack Overflow answer 31514238.
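To double-check that entry limit, here is a minimal sketch of my own (not from the original comment) that counts the `<entry>` elements of the feed, assuming the standard Atom namespace:

```python
# Count how many entries the channel's Atom feed exposes; expected to be ~15.
import requests
from lxml import etree

CHANNEL_ID = 'UCWeg2Pkate69NFdBeuRFTAw'
feed = requests.get(f'https://www.youtube.com/feeds/videos.xml?channel_id={CHANNEL_ID}').content
root = etree.fromstring(feed)
entries = root.findall('{http://www.w3.org/2005/Atom}entry')
print(len(entries))
```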
```python
import requests
from lxml import html
import json

CHANNEL_HANDLE = '@MLB'

text = requests.get(f'https://www.youtube.com/{CHANNEL_HANDLE}/videos').text
tree = html.fromstring(text)

# Extract the `ytInitialData` JSON embedded in one of the page's <script> tags.
ytVariableName = 'ytInitialData'
ytVariableDeclaration = ytVariableName + ' = '
for script in tree.xpath('//script'):
    scriptContent = script.text_content()
    if ytVariableDeclaration in scriptContent:
        ytVariableData = json.loads(scriptContent.split(ytVariableDeclaration)[1][:-1])
        break

contents = ytVariableData['contents']['twoColumnBrowseResultsRenderer']['tabs'][1]['tabRenderer']['content']['richGridRenderer']['contents']

videoIds = set()

def treatContents(contents):
    for content in contents:
        if not 'richItemRenderer' in content:
            break
        videoId = content['richItemRenderer']['content']['videoRenderer']['videoId']
        videoIds.add(videoId)
        print(len(videoIds))
    return getContinuationToken(contents)

def getContinuationToken(contents):
    # Sometimes have 29 actual results instead of 30.
    lastContent = contents[-1]
    if not 'continuationItemRenderer' in lastContent:
        exit(0)
    return lastContent['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token']

continuationToken = treatContents(contents)

url = 'https://www.youtube.com/youtubei/v1/browse'
headers = {
    'Content-Type': 'application/json'
}
requestData = {
    'context': {
        'client': {
            'clientName': 'WEB',
            'clientVersion': '2.20240313.05.00'
        }
    }
}

while True:
    requestData['continuation'] = continuationToken
    data = requests.post(url, headers = headers, json = requestData).json()
    # Happens not deterministically sometimes.
    if not 'onResponseReceivedActions' in data:
        print('Retrying')
        continue
    continuationItems = data['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']
    continuationToken = treatContents(continuationItems)
```

Got:
What about reversing the order to have 40,000 results instead of 20,000? The YouTube Data API v3 PlaylistItems: list endpoint does not have an `order` parameter. However, if shuffle worked correctly, it could possibly lead to something interesting, but I have doubts. I am not able to get an interesting shuffle, as it seems to only consider the first page of entries. I may not have written down the script I used to test the YouTube Data API v3 Search: list endpoint with `publishedAfter`; `grep -r 'publishedAfter' --include='*.py'` does not return interesting results.
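As a rough illustration of the date-based idea, here is a hypothetical sketch of my own (not the script mentioned above); the shrinking `publishedBefore` window, the use of the noKey search endpoint, and the termination check are all assumptions:

```python
# Page through a channel's videos by shrinking a publishedBefore window on the
# Search: list endpoint, instead of relying on pageToken pagination.
import requests

CHANNEL_ID = 'UCf8w5m0YsRa8MHQ5bwSGmbw'
URL = 'https://yt.lemnoslife.com/noKey/search'

videoIds = set()
publishedBefore = None
while True:
    params = {
        'part': 'snippet',
        'channelId': CHANNEL_ID,
        'type': 'video',
        'order': 'date',
        'maxResults': 50,
    }
    if publishedBefore is not None:
        params['publishedBefore'] = publishedBefore
    items = requests.get(URL, params = params).json().get('items', [])
    if not items:
        break
    previousCount = len(videoIds)
    for item in items:
        videoIds.add(item['id']['videoId'])
    # Move the window to the oldest result of this batch.
    publishedBefore = items[-1]['snippet']['publishedAt']
    if len(videoIds) == previousCount:
        break

print(len(videoIds))
```

One caveat: Search: list costs far more quota than PlaylistItems: list and is not guaranteed to return every video of a channel, so this can only complement the approaches above.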
```python
import requests
from lxml import html
import json

showProgress = True

def treatContents(videoIds, contents):
    for content in contents:
        if not 'richItemRenderer' in content:
            break
        videoId = content['richItemRenderer']['content']['videoRenderer']['videoId']
        videoIds.add(videoId)
        if showProgress:
            print(len(videoIds))
    return getContinuationToken(videoIds, contents)

def getContinuationToken(videoIds, contents):
    # Sometimes have 29 actual results instead of 30.
    lastContent = contents[-1]
    if not 'continuationItemRenderer' in lastContent:
        return videoIds
    return lastContent['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token']

def getChannelVideoIds(channelHandle):
    text = requests.get(f'https://www.youtube.com/{channelHandle}/videos').text
    tree = html.fromstring(text)
    ytVariableName = 'ytInitialData'
    ytVariableDeclaration = ytVariableName + ' = '
    for script in tree.xpath('//script'):
        scriptContent = script.text_content()
        if ytVariableDeclaration in scriptContent:
            ytVariableData = json.loads(scriptContent.split(ytVariableDeclaration)[1][:-1])
            break
    contents = ytVariableData['contents']['twoColumnBrowseResultsRenderer']['tabs'][1]['tabRenderer']['content']['richGridRenderer']['contents']
    videoIds = set()
    continuationToken = treatContents(videoIds, contents)
    if type(continuationToken) is set:
        return continuationToken
    url = 'https://www.youtube.com/youtubei/v1/browse'
    headers = {
        'Content-Type': 'application/json'
    }
    requestData = {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20240313.05.00'
            }
        }
    }
    while True:
        requestData['continuation'] = continuationToken
        try:
            data = requests.post(url, headers = headers, json = requestData).json()
        except requests.exceptions.SSLError:
            print('SSL error, retrying')
            continue
        # Happens not deterministically sometimes.
        if not 'onResponseReceivedActions' in data:
            print('Missing onResponseReceivedActions, retrying')
            continue
        continuationItems = data['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']
        continuationToken = treatContents(videoIds, continuationItems)
        if type(continuationToken) is set:
            return continuationToken

# Source: https://youtube.fandom.com/wiki/List_of_YouTube_channels_with_the_most_video_uploads?oldid=1795583
CHANNEL_HANDLES = [
    '@RoelVandePaar',
    '@Doubtnut',
    '@KnowledgeBaseLibrary',
    '@betterbandai4163',
    '@Hey_Delphi',
    '@molecularmagexdshorts3706',
]

url = 'https://yt.lemnoslife.com/channels'
params = {
    'part': 'about',
}

for channelHandle in CHANNEL_HANDLES:
    params['handle'] = channelHandle
    claimedNumberOfVideos = requests.get(url, params = params).json()['items'][0]['about']['stats']['videoCount']
    print(f'{channelHandle} claims {claimedNumberOfVideos} videos.')
    foundVideoIds = getChannelVideoIds(channelHandle)
    print(f'Found {len(foundVideoIds)} videos.')
```
Benchmarking:

```python
import requests
from lxml import html
import json
from tqdm import tqdm

def treatContents(videoIds, contents):
    for content in contents:
        if not 'richItemRenderer' in content:
            break
        videoId = content['richItemRenderer']['content']['videoRenderer']['videoId']
        videoIds.add(videoId)
    return getContinuationToken(videoIds, contents)

def getContinuationToken(videoIds, contents):
    # Sometimes have 29 actual results instead of 30.
    lastContent = contents[-1]
    if not 'continuationItemRenderer' in lastContent:
        return videoIds
    return lastContent['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token']

def getChannelVideoIds(channelHandle, claimedNumberOfVideos):
    text = requests.get(f'https://www.youtube.com/{channelHandle}/videos').text
    tree = html.fromstring(text)
    ytVariableName = 'ytInitialData'
    ytVariableDeclaration = ytVariableName + ' = '
    for script in tree.xpath('//script'):
        scriptContent = script.text_content()
        if ytVariableDeclaration in scriptContent:
            ytVariableData = json.loads(scriptContent.split(ytVariableDeclaration)[1][:-1])
            break
    contents = ytVariableData['contents']['twoColumnBrowseResultsRenderer']['tabs'][1]['tabRenderer']['content']['richGridRenderer']['contents']
    videoIds = set()
    continuationToken = treatContents(videoIds, contents)
    if type(continuationToken) is set:
        return continuationToken
    url = 'https://www.youtube.com/youtubei/v1/browse'
    headers = {
        'Content-Type': 'application/json'
    }
    requestData = {
        'context': {
            'client': {
                'clientName': 'WEB',
                'clientVersion': '2.20240313.05.00'
            }
        }
    }
    with tqdm(total = claimedNumberOfVideos) as pbar:
        while True:
            requestData['continuation'] = continuationToken
            try:
                data = requests.post(url, headers = headers, json = requestData).json()
            except requests.exceptions.SSLError:
                print('SSL error, retrying')
                continue
            # Happens not deterministically sometimes.
            if not 'onResponseReceivedActions' in data:
                print('Missing onResponseReceivedActions, retrying')
                with open('error.json', 'w') as f:
                    json.dump(data, f, indent = 4)
                continue
            continuationItems = data['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems']
            continuationToken = treatContents(videoIds, continuationItems)
            if type(continuationToken) is set:
                return continuationToken
            pbar.update(len(continuationItems))

# Source: https://youtube.fandom.com/wiki/List_of_YouTube_channels_with_the_most_video_uploads?oldid=1795583
CHANNEL_HANDLES = [
    '@RoelVandePaar',
    '@Doubtnut',
    '@KnowledgeBaseLibrary',
    '@betterbandai4163',
    '@Hey_Delphi',
]

url = 'https://yt.lemnoslife.com/channels'
params = {
    'part': 'about',
}

for channelHandle in CHANNEL_HANDLES[::-1]:
    params['handle'] = channelHandle
    claimedNumberOfVideos = requests.get(url, params = params).json()['items'][0]['about']['stats']['videoCount']
    print(f'{channelHandle} claims {claimedNumberOfVideos} videos.')
    foundVideoIds = getChannelVideoIds(channelHandle, claimedNumberOfVideos)
    print(f'Found {len(foundVideoIds)} videos.')
```

The last progress line shows a speed decrease after a few tens of thousands of videos. I should count how many temporary errors occur in comparison with how many successful requests. Note that here I am considering the channels with the most videos, so this is an extreme case. As the process takes a while, I may stop it before its completion; I think retrieving about half a million videos is enough to get an idea of the workload.
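As a follow-up to counting temporary errors, a hypothetical sketch of my own (the wrapper and counter names are assumptions, not part of the original script):

```python
# Wrap the InnerTube browse call so that temporary failures and successes are
# counted, to quantify the retry overhead.
import requests

def browseWithRetries(requestData, counters):
    url = 'https://www.youtube.com/youtubei/v1/browse'
    headers = {'Content-Type': 'application/json'}
    while True:
        try:
            data = requests.post(url, headers = headers, json = requestData).json()
        except requests.exceptions.SSLError:
            counters['temporaryErrors'] += 1
            continue
        if not 'onResponseReceivedActions' in data:
            counters['temporaryErrors'] += 1
            continue
        counters['successfulRequests'] += 1
        return data

counters = {'temporaryErrors': 0, 'successfulRequests': 0}
# After the run: print(counters) to compare error and success counts.
```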
error.json

Following an error not handled by my algorithm, the process stopped. I do not plan to restart this experiment, as the situation is quite clear.
Let us verify the correctness of the above algorithm for the channel @bwftv:

```python
import requests
import json

CHANNEL_ID = 'UChh-akEbUM8_6ghGVnJd6cQ'
PLAYLIST_ID = 'UU' + CHANNEL_ID[2:]

YOUTUBE_OPERATIONAL_API_URL = 'https://yt.lemnoslife.com'

# First list all uploads of the channel.
URL = f'{YOUTUBE_OPERATIONAL_API_URL}/noKey/playlistItems'
params = {
    'part': ','.join(['snippet']),
    'playlistId': PLAYLIST_ID,
    'maxResults': 50,
}

videoIds = set()

while True:
    response = requests.get(URL, params = params).json()
    for item in response['items']:
        videoIds.add(item['snippet']['resourceId']['videoId'])
    print(len(videoIds))
    nextPageToken = response.get('nextPageToken')
    if not nextPageToken:
        break
    params['pageToken'] = nextPageToken

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

# Then count how many of these uploads are livestreams.
URL = f'{YOUTUBE_OPERATIONAL_API_URL}/noKey/videos'
params = {
    'part': ','.join(['liveStreamingDetails']),
}

videoIdsSets = chunks(list(videoIds), 50)

liveIds = set()
for videoIdsSet in videoIdsSets:
    params['id'] = ','.join(videoIdsSet)
    data = requests.get(URL, params = params).json()
    for item in data['items']:
        if 'liveStreamingDetails' in item:
            liveIds.add(item['id'])

print(len(liveIds))
```

Returns 17,385 videos including 9,012 lives.

```python
import requests
import json

CHANNEL_ID = 'UChh-akEbUM8_6ghGVnJd6cQ'

# Count the channel's Shorts with the YouTube operational API channels endpoint.
URL = 'https://yt.lemnoslife.com/channels'
params = {
    'part': ','.join(['shorts']),
    'id': CHANNEL_ID,
}

videoIds = set()

while True:
    response = requests.get(URL, params = params).json()
    shorts = response['items'][0]
    for item in shorts['shorts']:
        videoIds.add(item['videoId'])
    print(len(videoIds))
    nextPageToken = shorts.get('nextPageToken')
    if not nextPageToken:
        break
    params['pageToken'] = nextPageToken
```

Returns 243, and I verified that on @bwftv/shorts. So the above algorithm should find 17,385 - 9,012 - 243 = 8,130 videos, and that is quite close to what we find with:

```sh
python3 test.py
```

So the above algorithm works well.
I faced this issue multiple times:
and this Stack Overflow question is asking for a solution.