-
Notifications
You must be signed in to change notification settings - Fork 6
Expand file tree
/
Copy pathgithub_search_api.py
More file actions
96 lines (71 loc) · 3.84 KB
/
github_search_api.py
File metadata and controls
96 lines (71 loc) · 3.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# Import required modules
import requests
import time
import csv
# Paste your Access token here
# To create an access token - https://github.com/settings/tokens
token = "access_token_here"
# GitHub removed support for passing the token as an ?access_token= query
# parameter; it must be sent in the Authorization header instead.
headers = {'Authorization': 'token ' + token}
# Base API Endpoint
base_api_url = 'https://api.github.com/'
# Enter multiple word queries with a '+' sign
# Ex: machine+learning to search for Machine Learning
print('Enter the Search Query to get the Data ')
query = input()
print('\n Query entered is', query, '\n')
# A CSV file containing the data would be saved with the name as the query
# Ex: machine+learning.csv
filename = query + '.csv'
# Open (and truncate) the CSV file once for the whole run; reopening it in
# append mode per repository is wasteful and shadowed the outer writer.
with open(filename, 'w', newline='') as csvfile:
    write_to_csv = csv.writer(csvfile, delimiter='|')
    # GitHub returns information of only 30 repositories with every request.
    # The Search API Endpoint only allows up to 1000 results, hence the
    # range has been set to 35 (pages 1..34).
    for page in range(1, 35):
        # Building the Search API URL for the current page
        search_final_url = (base_api_url + 'search/repositories?q=' +
                            query + '&page=' + str(page))
        # Catch only network errors (a bare except hides real bugs) and
        # skip this page; falling through would reuse an undefined or
        # stale `response`.
        try:
            response = requests.get(search_final_url, headers=headers).json()
        except requests.RequestException:
            print("Issue with GitHub API, Check your token")
            continue
        # Error/rate-limit responses carry no 'items' key; treat them as
        # an empty page instead of raising KeyError.
        for item in response.get('items', []):
            repo_name = item['name']
            repo_description = item['description']
            repo_stars = item['stargazers_count']
            repo_watchers = item['watchers_count']
            repo_forks = item['forks_count']
            repo_issues_count = item['open_issues_count']
            repo_main_language = item['language']
            # repo_score is the relevancy score of a repository to the
            # search query
            # Reference - https://developer.github.com/v3/search/#ranking-search-results
            repo_score = item['score']
            # Many Repositories don't have a license, this is to filter them out
            if item['license']:
                repo_license = item['license']['name']
            else:
                repo_license = "NO LICENSE"
            # Just incase, you face any issue with GitHub API Rate Limiting, use the sleep function as a workaround
            # Reference - https://developer.github.com/v3/search/#rate-limit
            # time.sleep(10)
            # Languages URL to access all the languages present in the repository
            language_url = item['url'] + '/languages'
            try:
                language_response = requests.get(
                    language_url, headers=headers).json()
            except requests.RequestException:
                # Best-effort: a failed languages call should not abort
                # the whole run.
                language_response = {}
            # Calculation for the percentage of all the languages present in
            # the repository; guard the total so an empty/zero byte count
            # cannot raise ZeroDivisionError.
            repo_languages = {}
            count_value = sum(language_response.values())
            if count_value:
                for key, value in language_response.items():
                    repo_languages[key] = round((value / count_value) * 100, 2)
            print("Repo Name = ", repo_name, "\tDescription", repo_description, "\tStars = ", repo_stars, "\tWatchers = ", repo_watchers, "\tForks = ", repo_forks,
                  "\tOpen Issues = ", repo_issues_count, "\tPrimary Language = ", repo_main_language, "\tRepo Languages =", repo_languages, '\tRepo Score', repo_score)
            # Write as a row to the CSV file
            write_to_csv.writerow([repo_name, repo_description, repo_stars, repo_watchers, repo_forks,
                                   repo_license, repo_issues_count, repo_score, repo_main_language, repo_languages])
            print('==========')