-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape_github_stars.py
More file actions
85 lines (61 loc) · 3.04 KB
/
scrape_github_stars.py
File metadata and controls
85 lines (61 loc) · 3.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 20 11:31:51 2015
@author: NancyLi
"""
import requests
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
# Download the star history of each repository and derive its cumulative
# monthly star totals.
# NOTE(review): star_history_list and star_monthly_list are defined further
# down in this file, so they must already exist (for example from an earlier
# run in an interactive session) by the time this section executes.
repo_list = [
    ('druid-io', 'druid'),
    ('apache', 'kafka'),
    ('apache', 'storm'),
    ('amplab', 'tachyon'),
    ('apache', 'spark'),
    ('docker', 'docker'),
    ('mbostock', 'd3'),
    ('apache', 'mesos'),
    ('apache', 'samza'),
]
star_history_dict = {}
monthly_star_total = {}
for user, repo in repo_list:
    history = star_history_list(user, repo)
    star_history_dict[repo] = history
    monthly_star_total[repo] = star_monthly_list(history)

# One line per repository: x is the 1-based month count since the first
# star, y is the cumulative star total; colours are spread over the rainbow
# colormap in repo_list order so the legend entries line up with the lines.
palette = plt.cm.rainbow(np.linspace(0, 1, len(repo_list)))
for (_, repo_name), line_color in zip(repo_list, palette):
    totals = monthly_star_total[repo_name]
    month_numbers = range(1, len(totals) + 1)
    plt.plot(month_numbers, totals, c=line_color)
plt.legend([repo_name for _, repo_name in repo_list])
plt.ylabel('Total number of stars', fontsize = 14)
plt.xlabel('Number of month since the beginning of repo', fontsize = 14)
plt.title('Graph 4. Comparison of number of stars', fontsize = 18)
### FUNCTIONS ----------------------------------------------------------------------------
def star_history_list(user, repo, auth=None):
    """Download the complete stargazer history of a GitHub repository.

    Queries the ``/stargazers`` endpoint with the ``star+json`` media type,
    so each returned entry carries a ``starred_at`` timestamp, and walks the
    paginated result by following the ``rel="next"`` link of every response
    until no further page is advertised.

    Parameters
    ----------
    user : str
        Owner (user or organisation) of the repository.
    repo : str
        Repository name.
    auth : tuple of (str, str), optional
        ``(user_name, password_or_token)`` pair used for HTTP basic auth.
        When omitted, the placeholder pair embedded below is used, which
        keeps the original call signature working.

    Returns
    -------
    pandas.DataFrame
        One row per star, with columns ``timestamp`` (the raw ``starred_at``
        string from the API) and ``user`` (the stargazer's login).

    Raises
    ------
    requests.HTTPError
        If any page request comes back with an error status.
    """
    if auth is None:
        # NOTE(review): hard-coded placeholder credentials. Pass ``auth``
        # explicitly rather than committing real secrets to this file.
        auth = ('Y-Q', 'totallynotmypassword')

    url = 'https://api.github.com/repos/' + user + '/' + repo + '/stargazers'
    headers = {'Accept': 'application/vnd.github.v3.star+json'}
    params = {'per_page': 100}

    rows = []
    with requests.Session() as session:
        session.auth = auth
        session.headers.update(headers)
        while url:
            response = session.get(url, params=params)
            # Fail loudly on an error status instead of trying to index an
            # error payload as if it were a list of stargazers.
            response.raise_for_status()
            rows.extend(
                [entry['starred_at'], entry['user']['login']]
                for entry in response.json()
            )
            # requests parses the Link header into ``response.links``; the
            # "next" URL already carries its own query string, so the
            # initial params must not be sent again.
            url = response.links.get('next', {}).get('url')
            params = None
    return pd.DataFrame(rows, columns=['timestamp', 'user'])
def star_monthly_list(star_list):
    """Aggregate a star history into cumulative star totals per month.

    Parameters
    ----------
    star_list : pandas.DataFrame
        Frame with a ``timestamp`` column of ``'%Y-%m-%dT%H:%M:%SZ'``
        strings and a ``user`` column, one row per star.  The frame is not
        modified.

    Returns
    -------
    pandas.DataFrame
        Single column ``star_growth`` holding the running total of stars,
        indexed by the month-end date of every calendar month from the first
        star to the last one.  Months without any new star are included and
        simply repeat the previous total.  An empty input yields an empty
        frame.
    """
    columns = ['star_growth']
    if len(star_list) == 0:
        return pd.DataFrame(columns=columns, index=pd.DatetimeIndex([]),
                            dtype='int64')

    # Parse into a separate Series so the caller's frame is left untouched
    # and the function can be called again on the same input.
    parsed = pd.Series(
        [datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%SZ')
         for stamp in star_list['timestamp']],
        index=star_list.index,
    )
    months = parsed.dt.to_period('M')

    # Count non-null users per calendar month (same counting rule as
    # ``groupby(...).count()['user']``).
    new_per_month = star_list['user'].groupby(months).count()

    # Re-index over every month in the covered span: grouping alone skips
    # months with zero stars, which would shift all later totals onto the
    # wrong dates.
    all_months = pd.period_range(months.min(), months.max(), freq='M')
    cumulative = new_per_month.reindex(all_months, fill_value=0).cumsum()

    # Label each month by its last day at midnight.
    month_ends = all_months.to_timestamp(how='end').normalize()
    return pd.DataFrame(cumulative.to_numpy(), index=month_ends,
                        columns=columns)