forked from anam-dodhy/github-query-script
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_matching_code_files.py
140 lines (119 loc) · 4.24 KB
/
get_matching_code_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
'''
Given a list of Github repositories, this script queries those repositories
for Java code files that use javax.crypto.
It outputs a csv file containing, for each matching code file, the index,
name, commit count and commit URL of its repository and the link to the
code file.
QUERY_SIZE limits the number of repositories per query to avoid running
into URL size limits.
'''
import json
from github3 import login
import getpass
import csv
import argparse
QUERY_SIZE = 40
class CodeSearch(object):
    '''
    Searches a list of Github repositories for Java files that use
    javax.crypto and records every match in a csv file.
    '''

    def __init__(self, username, repo_file, output_file):
        '''
        Prompt for a password and log in to Github as username.

        username    -- github user name
        repo_file   -- name of the file containing the repository list
        output_file -- name of the csv file the matches are written to
        '''
        self.github = login(username, password=getpass.getpass())
        self.base_query = '"javax.crypto" language:java'
        self.repositories = []
        self.writer = None
        self.repo_file = repo_file
        self.output_file = output_file
        # Handle of the csv file behind self.writer; kept so that the file
        # can be flushed and closed instead of being leaked.
        self._csv_file = None

    def read_repositories(self):
        '''
        Read the list of repositories whose code will be queried.

        Every non-blank line of repo_file must have the form
        "index;name;commit_count;commit_url". The repositories are kept in
        file order and all four fields are stored as strings.

        Raises ValueError if a line does not have exactly four fields.
        '''
        repositories = []
        with open(self.repo_file, 'r') as input_file:
            for line_number, line in enumerate(input_file, 1):
                # Strip the line terminator up front; otherwise it stays
                # attached to the last field (commit_url) and ends up in
                # the csv output.
                line = line.strip()
                if not line:
                    continue
                fields = line.split(';')
                if len(fields) != 4:
                    raise ValueError(
                        '{0}:{1}: expected 4 fields separated by ";", '
                        'got {2}'.format(self.repo_file, line_number,
                                         len(fields)))
                index, name, commit_count, commit_url = fields
                repositories.append({
                    'index': index,
                    'name': name,
                    'commit_count': commit_count,
                    'commit_url': commit_url,
                })
        self.repositories = repositories

    def write_code_results(self, code_results, repos_to_query):
        '''
        Given the results of the code search, write each repository and the
        matching file url to the csv file.

        code_results   -- iterable of results, each with a repository and
                          an html_url attribute
        repos_to_query -- the repository entries the search was issued for;
                          used to look up index, commit count and commit url
                          of the repository of every result
        '''
        # One lookup table per batch instead of rescanning the batch for
        # every result. A later duplicate name overrides an earlier one.
        repos_by_name = {str(repo['name']): repo for repo in repos_to_query}
        # Values written when a result belongs to none of the queried repos.
        unknown_repo = {'index': 0, 'commit_count': 0, 'commit_url': ''}

        for code_result in code_results:
            repo = repos_by_name.get(str(code_result.repository),
                                     unknown_repo)
            self.writer.writerow({
                'index': repo['index'],
                'repository': code_result.repository,
                'file_url': code_result.html_url,
                'commit_count': repo['commit_count'],
                'commit_url': repo['commit_url'],
            })

        # Push every finished batch to disk so that an error in a later
        # query does not lose the rows collected so far.
        if self._csv_file is not None:
            self._csv_file.flush()

    def search_code_in_repos(self, repos):
        '''
        Appends a list of given repos to the base_query
        and uses the Github search API to search those repos.

        Returns whatever search_code() of the github3 object returns.
        '''
        query = self.base_query + ''.join(
            ' repo:' + repo['name'] for repo in repos)
        print(query)
        return self.github.search_code(query)

    def search_code(self, start_index, end_index):
        '''
        Creates a sublist of the available repositories given the
        start and end indices, issues the query, and then writes the
        results to the csv file.

        The sublist is repositories[start_index:end_index], i.e. the
        repository at end_index itself is NOT part of the query.
        '''
        repos_to_query = self.repositories[start_index:end_index]
        code_results = self.search_code_in_repos(repos_to_query)
        self.write_code_results(code_results, repos_to_query)

    def create_csv_file(self):
        '''
        Creates the csv file and its header.

        The file stays open for write_code_results(); call close() once all
        results have been written.
        '''
        # Do not leak the handle of a previously created file.
        self.close()
        self._csv_file = open(self.output_file, 'w')
        fieldnames = ['index', 'repository', 'file_url', 'commit_count',
                      'commit_url']
        self.writer = csv.DictWriter(self._csv_file, fieldnames=fieldnames)
        self.writer.writeheader()

    def close(self):
        '''
        Closes the csv file, if one is open. Safe to call more than once.
        '''
        if self._csv_file is not None:
            self._csv_file.close()
            self._csv_file = None
            self.writer = None

    def get_repositories(self):
        '''
        returns the available list of repos
        '''
        return self.repositories

    def get_github_obj(self):
        '''
        returns the github3 wrapper object
        '''
        return self.github
def run(username, repoFile, outputFile):
    '''
    Query every repository listed in repoFile for matching code files and
    write the matches to outputFile.

    The repositories are searched in consecutive batches of at most
    QUERY_SIZE repositories so that a single query does not grow too long.

    username   -- github user name used to log in
    repoFile   -- name of the file containing the repository list
    outputFile -- name of the csv file to create
    '''
    code_search = CodeSearch(username, repoFile, outputFile)
    code_search.read_repositories()
    code_search.create_csv_file()

    num_of_repos = len(code_search.get_repositories())
    print('num of repos %d' % num_of_repos)

    # search_code() slices repositories[start_index:end_index], so the end
    # index is exclusive. Stepping by QUERY_SIZE and clamping the end to the
    # list length covers every repository exactly once, including a shorter
    # final batch, and never issues a query for an empty batch (which would
    # search with no "repo:" restriction at all).
    for start_index in range(0, num_of_repos, QUERY_SIZE):
        end_index = min(start_index + QUERY_SIZE, num_of_repos)
        code_search.search_code(start_index, end_index)
def build_arg_parser():
    '''
    Build the command line parser of the script.

    All three options are required: without any one of them the script
    cannot do its work, so a missing option is reported by argparse right
    away rather than failing later with a None value.
    '''
    parser = argparse.ArgumentParser(
        description='Process repositories in repoFile and output matching '
                    'code files in outputFile')
    parser.add_argument('--repoFile', required=True,
                        help='name of file containing repository list')
    parser.add_argument('--outputFile', required=True,
                        help='name of output file')
    parser.add_argument('--username', required=True,
                        help='github user name')
    return parser


def main():
    '''
    Parse the command line, echo the arguments and run the code search.
    '''
    args = build_arg_parser().parse_args()
    print('repofile %s' % args.repoFile)
    print('outputfile %s' % args.outputFile)
    print('username %s' % args.username)
    run(args.username, args.repoFile, args.outputFile)


# Only run when executed as a script, so that importing the module does not
# parse sys.argv or start querying Github.
if __name__ == '__main__':
    main()