-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
100 lines (77 loc) · 3.13 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import time
import sys
import csv
import urllib2
import json
import Queue
from threading import Thread
from threading import Lock
# Number of Worker threads started by main().
NUMBER_OF_THREADS = 5

# File that WriteRepos() writes the collected results to.
OUTPUT_FILE = "output.csv"

# Column names of the input CSV, in column order. Each accepted row is zipped
# against this list to build a dict describing one repo.
repo_properties = [
    "repo_id",
    "repo_name",
    "repo_url",
    "actor_id",
    "actor_login",
    "actor_avatar_url",
    "actor_url",
    "org_id",
    "org_login",
    "org_url",
    "commits",
    "html_url",
    "forked",
    "description",
    "full_name",
    "lang",
    "watchers",
    "forks_count",
]

# URL pattern for a single repo; "{repo_name}" is replaced with the repo's name.
GithubUrlTemplate = "https://api.github.com/repos/{repo_name}"

# Held by workers while they take from, or put back into, the shared repo queue.
QueueLock = Lock()

# Held by workers while they write into Results.
ResultsLock = Lock()

# Maps repo name -> the "updated_at" value fetched for that repo.
Results = {}
def ExtractRepos(filePath):
    """Load repository rows from a CSV file into a work queue.

    The first row is treated as a header and discarded. Every following row
    must have exactly one field per entry of ``repo_properties``; rows of any
    other length are reported on stdout and skipped.

    Args:
        filePath: Path of the comma-separated input file.

    Returns:
        A ``Queue.Queue`` holding one dict per accepted row, keyed by the
        names in ``repo_properties``. The queue is empty when the file has
        no data rows (or no rows at all).
    """
    reposQueue = Queue.Queue()
    expectedLength = len(repo_properties)
    with open(filePath) as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        # Skip the header. The default value keeps a completely empty file
        # from raising StopIteration; it simply produces an empty queue.
        next(reader, None)
        for row in reader:
            if len(row) != expectedLength:
                print("Warning: line format is wrong and will be skipped. Length: "
                      + str(len(row)) + ". Row: " + ','.join(row))
                continue
            reposQueue.put(dict(zip(repo_properties, row)))
    return reposQueue
def WriteRepos():
    """Write the collected results to OUTPUT_FILE as two-column CSV rows.

    Every entry of the module-level ``Results`` dict becomes one row holding
    the key followed by the value. The file is opened in ``'w'`` mode, so any
    previous content is replaced.
    """
    with open(OUTPUT_FILE, 'w') as csv_file:
        csv.writer(csv_file).writerows(
            [repoName, updatedAt] for repoName, updatedAt in Results.items()
        )
def main(arg):
    """Run the whole pipeline for a single input file.

    Loads the repos from the input CSV, processes them with a pool of
    NUMBER_OF_THREADS workers, waits for all workers and writes the results.

    Args:
        arg: Command-line arguments without the program name; must contain
            exactly one element, the path of the input CSV file.

    Raises:
        ValueError: If ``arg`` does not contain exactly one element.
    """
    if len(arg) != 1:
        raise ValueError("Exactly 1 file is expected as input")

    pendingRepos = ExtractRepos(arg[0])

    # Build the fixed-size pool, starting each worker right after creating it.
    pool = []
    for workerId in range(NUMBER_OF_THREADS):
        worker = Worker(pendingRepos, workerId)
        pool.append(worker)
        worker.start()

    # Wait for every worker to finish before the results are written out.
    for worker in pool:
        worker.join()

    WriteRepos()
class Worker(Thread):
    """Thread that drains the shared repo queue and records update times.

    For every repo taken from the queue the worker requests the repo's URL
    (built from ``GithubUrlTemplate``) and stores the ``updated_at`` field of
    the JSON response in the module-level ``Results`` dict under the repo's
    name.
    """

    def __init__(self, reposQueue, id):
        """Create a worker.

        Args:
            reposQueue: Queue of repo dicts shared by all workers.
            id: Value used to identify this worker in the log output.
        """
        Thread.__init__(self)
        self.reposQueue = reposQueue
        self.id = id

    def run(self):
        """Process repos until the shared queue is observed to be empty.

        HTTP 404 responses are reported and the repo is dropped. HTTP 403
        responses are treated as throttling: the repo is put back into the
        queue and the worker sleeps until the time given in the response's
        ``X-RateLimit-Reset`` header. Any other HTTP error is reported and the
        repo is dropped.
        """
        while not self.reposQueue.empty():
            # Re-check emptiness while holding the lock so that the check and
            # the get() cannot be interleaved with another worker's get().
            with QueueLock:
                if self.reposQueue.empty():
                    continue
                repo = self.reposQueue.get()

            repoAddress = GithubUrlTemplate.replace("{repo_name}", repo["repo_name"])
            print("Worker " + str(self.id) + " Working on repo" + str(repoAddress))
            try:
                response = urllib2.urlopen(repoAddress)
                try:
                    contents = response.read()
                finally:
                    # Always release the connection, even if reading fails.
                    response.close()
                contentSerialized = json.loads(contents)
                with ResultsLock:
                    Results[repo["repo_name"]] = str(contentSerialized["updated_at"])
            except urllib2.HTTPError as e:
                if e.code == 404:
                    print("Repo Not Found: " + repo["repo_name"])
                elif e.code == 403:
                    print("Throttled")
                    with QueueLock:
                        # Adding repo back to the queue since result wasn't received
                        self.reposQueue.put(repo)
                    # Sleep until the quota reset time announced by the server.
                    resetTimeInUtcEpoch = int(e.headers["X-RateLimit-Reset"])
                    # Clamp at zero: the reset moment may already lie in the
                    # past, and time.sleep() raises on a negative duration,
                    # which would terminate this worker.
                    sleepTimeInSec = max(0, resetTimeInUtcEpoch - int(time.time()))
                    print("Thread " + str(self.id) + " sleeps for " + str(sleepTimeInSec) + " seconds")
                    time.sleep(sleepTimeInSec)
                else:
                    # Report instead of silently dropping the repo.
                    print("HTTP error " + str(e.code) + " for repo: " + repo["repo_name"])
# Script entry point: pass every command-line argument after the program
# name on to main().
if __name__ == "__main__":
    main(sys.argv[1:])