ghUpdatedRepos.new.py
'''
Script to scrape GitHub repos using the GraphQL API.
Obtains all repos that have been updated AFTER a specified date and
scrapes all repos from that date up to the current time.
The script accepts a token file on standard input, where each line is a combination
of a token string, a start date, and an end date (space-separated).
By default we run this script in 10 parallel processes; change the "remaining < #number"
check accordingly if more processes are required.
Tokens from different accounts are recommended.
Potential problems:
1. Token gets blocked
2. Multiple processes using tokens from the same account may result in racing
3. TBD
'''
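# Example invocation (a minimal sketch; the db name "ghDB", collection name
# "updatedRepos", and the dates below are placeholders, not part of the script):
#   echo "<token> 2015-01-01 2015-02-01" | python ghUpdatedRepos.new.py ghDB updatedRepos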
import requests
import json
import pymongo
from datetime import datetime, timedelta
import time
import sys

# get the GitHub API token, start date, and end date from standard input
token, begin, end = sys.stdin.readline().strip().split(' ')
try:
    datetime.strptime(begin, '%Y-%m-%d')
    datetime.strptime(end, '%Y-%m-%d')
except ValueError:
    raise ValueError("Incorrect date format, should be YYYY-MM-DD")
# DB info
client = pymongo.MongoClient()
dbName = sys.argv[1]    # db name as first command-line argument
collName = sys.argv[2]  # collection name as second command-line argument
db = client[dbName]
coll = db[collName]
url = 'https://api.github.com/graphql'
headers = {'Authorization': 'token ' + token}
start = begin + 'T00:00:00Z'
end_time = datetime.strptime(end + 'T00:00:00Z', "%Y-%m-%dT%H:%M:%SZ")
interval = datetime.strptime(start, "%Y-%m-%dT%H:%M:%SZ")
total = 0
remaining = 5000
# query that specifies which repos and what content to extract
query = '''{
  rateLimit {
    cost
    remaining
    resetAt
  }
  search(query: "is:public archived:false pushed:%s..%s", type: REPOSITORY, first: 100) {
    repositoryCount
    pageInfo {
      hasNextPage
      endCursor
      startCursor
    }
    edges {
      node {
        ... on Repository {
          nameWithOwner
          updatedAt
          createdAt
          isFork
          id
          description
        }
      }
    }
  }
}'''
jsonS = { 'query': query }
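# For illustration, after substitution in the driver loop below the search qualifier
# looks like this (dates are placeholders):
#   is:public archived:false pushed:2015-01-01T00:00:00Z..2015-01-01T00:10:00Z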

# wait for reset if we exhaust our number of calls
def wait(reset):
    # resetAt is an ISO 8601 UTC timestamp, so compare against the current UTC time
    now = datetime.utcnow()
    then = datetime.strptime(reset, "%Y-%m-%dT%H:%M:%SZ")
    seconds = (then - now).total_seconds() + 30
    time.sleep(max(seconds, 0))

# helper function to loop through and insert repos into mongo db
def gatherData(res):
    global total
    repos = res['data']['search']['edges']
    for repo in repos:
        coll.insert_one(repo['node'])
    total += len(repos)
    output = "Got {} repos. Total count is {}. Have {} calls remaining."
    print(output.format(len(repos), total, remaining))

# driver loop that iterates through repos in 10 minute intervals,
# from the specified date up to the current time
while interval < end_time:
    fromStr = interval.strftime("%Y-%m-%dT%H:%M:%SZ")
    toStr = (interval + timedelta(minutes=10)).strftime("%Y-%m-%dT%H:%M:%SZ")
    nextQuery = query % (fromStr, toStr)
    jsonS['query'] = nextQuery
    if token == '':
        print("Please provide your GitHub API token on standard input. Exiting.")
        sys.exit()
    r = requests.post(url=url, json=jsonS, headers=headers)
    res = json.loads(r.text)
    remaining = res['data']['rateLimit']['remaining']
    reset = res['data']['rateLimit']['resetAt']
    if remaining < 11:
        wait(reset)
    repos = res['data']['search']['repositoryCount']
    hasNextPage = res['data']['search']['pageInfo']['hasNextPage']
    gatherData(res)
    # check if we got more than 100 results and need to paginate
    while repos > 100 and hasNextPage:
        endCursor = res['data']['search']['pageInfo']['endCursor']
        print("Have to paginate, using cursor {}".format(endCursor))
        # splice the cursor into the search() arguments right after the type argument
        index = nextQuery.find("REPOSITORY") + len("REPOSITORY")
        pageQuery = nextQuery[:index] + ',after:"{}"'.format(endCursor) + nextQuery[index:]
        jsonS['query'] = pageQuery
        r = requests.post(url=url, json=jsonS, headers=headers)
        if r.ok:
            res = json.loads(r.text)
            try:
                remaining = res['data']['rateLimit']['remaining']
                reset = res['data']['rateLimit']['resetAt']
                if remaining < 11:
                    wait(reset)
                repos = res['data']['search']['repositoryCount']
                hasNextPage = res['data']['search']['pageInfo']['hasNextPage']
                gatherData(res)
            except Exception as e:
                print(e)
    interval += timedelta(minutes=10)
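
# A minimal sketch (not part of the original script) of the multi-process usage
# described in the docstring: one token-file line is piped to each worker process.
# The file name "tokens.txt" and the db/collection names are assumptions.
#
#   import subprocess
#
#   procs = []
#   with open('tokens.txt') as f:
#       for line in f:
#           p = subprocess.Popen(
#               ['python', 'ghUpdatedRepos.new.py', 'ghDB', 'updatedRepos'],
#               stdin=subprocess.PIPE, text=True)
#           p.stdin.write(line)
#           p.stdin.close()
#           procs.append(p)
#   for p in procs:
#       p.wait()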