# Source: Repo-evaluation-using-GitHub-API/scrape_github_stars.py at master · Y-Q/Repo-evaluation-using-GitHub-API · GitHub
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 20 11:31:51 2015

@author: NancyLi
"""

import os
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns


### FUNCTIONS ----------------------------------------------------------------------------
# The helpers are defined ahead of the script section: Python binds function
# names when the ``def`` statement executes, so calling them from code placed
# above their definition raises NameError.

def star_history_list(user, repo):
    """Download the stargazer history of a GitHub repository.

    Parameters
    ----------
    user : str
        Owner (user or organisation) of the repository.
    repo : str
        Repository name.

    Returns
    -------
    pandas.DataFrame
        One row per star with columns ``timestamp`` (the ``starred_at``
        string returned by the API) and ``user`` (login of the stargazer).

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (e.g. rate limit exceeded).

    Notes
    -----
    If the environment variable ``GITHUB_TOKEN`` is set, it is sent as an
    access token; otherwise requests are made unauthenticated. Credentials
    are deliberately not stored in the source file.
    """
    url = 'https://api.github.com/repos/' + user + '/' + repo + '/stargazers'
    # This media type asks the API to include the ``starred_at`` timestamp.
    headers = {'Accept': 'application/vnd.github.v3.star+json'}
    token = os.environ.get('GITHUB_TOKEN')
    if token:
        headers['Authorization'] = 'token ' + token

    rows = []
    params = {'per_page': 100}
    with requests.Session() as session:
        session.headers.update(headers)
        while url:
            response = session.get(url, params=params, timeout=30)
            # Fail loudly instead of iterating over an error payload.
            response.raise_for_status()
            for entry in response.json():
                stargazer = entry.get('user') or {}
                rows.append([entry['starred_at'], stargazer.get('login')])
            # Follow the parsed ``Link`` header; there is no "next" relation
            # on the last page. The next-page URL already carries the query
            # string, so no extra params are sent with it.
            url = response.links.get('next', {}).get('url')
            params = None

    return pd.DataFrame(rows, columns=['timestamp', 'user'])


def star_monthly_list(star_list):
    """Aggregate a star history into cumulative totals per calendar month.

    Parameters
    ----------
    star_list : pandas.DataFrame
        Must contain a ``timestamp`` column holding either strings in the
        form ``YYYY-MM-DDTHH:MM:SSZ`` or datetime objects. The frame is
        not modified.

    Returns
    -------
    pandas.DataFrame
        Single column ``star_growth`` with the running total of stars,
        indexed by month-end dates from the month of the first star to the
        month of the last one. Months without new stars are present and
        carry the previous total. An empty input yields an empty frame.
    """
    stamps = [
        datetime.strptime(value, '%Y-%m-%dT%H:%M:%SZ')
        if isinstance(value, str) else value
        for value in star_list['timestamp']
    ]
    if not stamps:
        return pd.DataFrame({'star_growth': np.array([], dtype=np.int64)},
                            index=pd.DatetimeIndex([]))

    first = min(stamps)
    # Number of whole calendar months between each star and the first one.
    month_offsets = [(stamp.year - first.year) * 12 + (stamp.month - first.month)
                     for stamp in stamps]
    # bincount keeps a zero for months without stars, so the totals stay
    # aligned with the gap-free monthly index built below.
    new_stars = np.bincount(np.asarray(month_offsets, dtype=np.int64))
    star_growth = np.cumsum(new_stars)

    monthly_dates = pd.date_range(start=first.date(), periods=len(star_growth),
                                  freq=pd.offsets.MonthEnd())

    return pd.DataFrame(star_growth, index=monthly_dates, columns=['star_growth'])


# get star history

repo_list = [('druid-io','druid'), ('apache','kafka'), ('apache','storm'), ('amplab','tachyon'), ('apache','spark'), ('docker','docker'), ('mbostock','d3'),('apache','mesos'),('apache','samza')]

star_history_dict = {}
monthly_star_total = {}

# Network access and plotting only happen when the file is executed, not when
# it is imported for its helper functions.
if __name__ == '__main__':
    for user, repo in repo_list:
        star_history_dict[repo] = star_history_list(user, repo)
        monthly_star_total[repo] = star_monthly_list(star_history_dict[repo])

    # plots

    # One evenly spaced rainbow colour per repository.
    colors = plt.cm.rainbow(np.linspace(0, 1, len(repo_list)))
    for (_, repo), c in zip(repo_list, colors):
        totals = monthly_star_total[repo]
        # x axis: months elapsed since the repository received its first star.
        plt.plot(range(1, len(totals) + 1), totals['star_growth'], c=c)
    plt.legend([repo for _, repo in repo_list])
    plt.ylabel('Total number of stars', fontsize = 14)
    plt.xlabel('Number of month since the beginning of repo', fontsize = 14)
    plt.title('Graph 4. Comparison of number of stars', fontsize = 18)
    plt.show()