-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_extractor.py
More file actions
executable file
·86 lines (68 loc) · 2.98 KB
/
data_extractor.py
File metadata and controls
executable file
·86 lines (68 loc) · 2.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import urllib2
import json
# find out the top 10 languages with projects that have over n stars
# find out the top n projects by star counts in each languages, with
# {id, name, html_url, owner.type, stargazers_count, forks_count, #of contributors, # of total commits, children:[contributors]}
# for each contributor in the children, {"name":..., "commits":..., size":1}
# Languages to query; 'c%2B%2B' is 'c++' pre-URL-encoded for the search query string.
LANGUAGES = ['javascript', 'objective-c', 'ruby', 'java', 'python', 'html', 'c', 'go', 'php', 'c%2B%2B']
# Smaller language list for quick test runs (not used by the main loop below).
TEST = ['javascript', 'ruby']
# NOTE(review): LIMIT appears unused in this file -- possibly consumed elsewhere; verify.
LIMIT = 10
# Number of top projects to keep per language.
PROJECT_LIMIT = 6
# Root node of the output hierarchy; each language becomes a child.
MASTER_LIST = {'name':'Top 10 Languages', 'children':[]}
# GitHub REST API endpoints.
SEARCH_REPO_URL = 'https://api.github.com/search/repositories'
CONTRIBUTOR_URL = 'https://api.github.com/repositories/'
CONTRIBUTOR_POSTFIX = '/stats/contributors'
# Minimum star count for a project to be considered.
STAR_LIMIT = 3000
TOKEN = 'token ' + '' # fill out your own github token string
# helper for fetching data from github server
def fetchDataFromUrl(url, query=''):
    """Fetch a URL from the GitHub API and return the decoded JSON payload.

    url   -- base endpoint to request
    query -- optional search query; when non-empty it is appended as '?q=<query>'
             (caller is responsible for any URL-encoding, see LANGUAGES)

    Sends the module-level TOKEN as the Authorization header.
    Raises urllib2.HTTPError / urllib2.URLError on network failure and
    ValueError if the response body is not valid JSON.
    """
    if query != '':
        req = urllib2.Request(url+'?q='+query, headers={"Authorization" : TOKEN})
    else:
        req = urllib2.Request(url, headers={"Authorization" : TOKEN})
    opener = urllib2.build_opener()
    f = opener.open(req)
    # Fix: the original never closed the response object, leaking the
    # underlying socket on every call; ensure it is closed even if the
    # body is not valid JSON.
    try:
        data = json.loads(f.read())
    finally:
        f.close()
    return data
def genContributorUrl(url, repo_id, postfix):
    """Build the contributor-stats endpoint URL for one repository id."""
    return '{0}{1}{2}'.format(url, repo_id, postfix)
# The script below will
# 1. iterate through the search data returned from GitHub, language by language
# 2. format each project and its contributor stats
# 3. fill the master list
# 4. (a later section writes MASTER_LIST out to a json file)
for lang in LANGUAGES:
    star = STAR_LIMIT
    if lang == 'javascript':
        star = 20000 # reduce the number of records to be fetched since javascript is popular
    q = 'language:'+str(lang)+'+stars:>'+str(star)+'&sort=stars&order=desc'
    lang_data = fetchDataFromUrl(SEARCH_REPO_URL, q) # search result: dict with an 'items' list
    # Hoisted out of the inner loop: 'items' is invariant per language
    # (the original re-read lang_data['items'] on every iteration).
    proj_items = lang_data['items']
    project_list = {} # node for this language, to be added in master list
    project_list['name'] = lang
    project_list['children'] = []
    # Slice instead of range(PROJECT_LIMIT): the original indexed blindly and
    # raised IndexError whenever the search returned fewer than PROJECT_LIMIT repos.
    for item in proj_items[:PROJECT_LIMIT]:
        proj_item = {}
        proj_item['repo_id'] = item['id']
        proj_item['name'] = item['name'] # project name
        proj_item['owner_name'] = item['owner']['login']
        proj_item['owner_type'] = item['owner']['type']
        proj_item['html_url'] = item['html_url']
        proj_item['stars'] = item['stargazers_count']
        proj_item['forks'] = item['forks_count']
        proj_item['children'] = [] # contributors
        contributors_data = fetchDataFromUrl(genContributorUrl(CONTRIBUTOR_URL, proj_item['repo_id'], CONTRIBUTOR_POSTFIX))
        # NOTE(review): GitHub's /stats/contributors endpoint answers 202 with a
        # non-list body while stats are being computed -- skip such payloads
        # instead of crashing on user['author'] below.
        if not isinstance(contributors_data, list):
            contributors_data = []
        for user in contributors_data: # get all users in that project
            contributor = {}
            contributor['name'] = user['author']['login']
            contributor['user_url'] = user['author']['html_url']
            contributor['commits'] = user['total']
            contributor['size'] = 1 # constant leaf size, for visualization purpose
            proj_item['children'].append(contributor)
        project_list['children'].append(proj_item)
    MASTER_LIST['children'].append(project_list)
# Persist the assembled hierarchy as JSON for the visualization front-end.
with open('test.json', 'w') as out_fh:
    out_fh.write(json.dumps(MASTER_LIST))