Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added FinalFile.xlsx
Binary file not shown.
415 changes: 415 additions & 0 deletions cmawhinn.ipynb

Large diffs are not rendered by default.

70 changes: 61 additions & 9 deletions README.md → cmawhinn_README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,32 +6,84 @@
NPM package list

The list of packages is unique to each one of you:
/data/shared/NPMvulnerabilities/NPMpkglist/NPMpkglist_XX.gz
/data/NPMvulnerabilities/NPMpkglist/NPMpkglist_XX.gz
where XX is between 0 and 33: to find your number look at the list below.

### Goal:
1. Download and store data from npm on all your packages on mongodb database:
fdac18mp2, collection: ghrel_yourutkid
1. Identify the packages that have GH repos (based on the stored info)
fdac18mp2, collection: npm_yourutkid, the example code is in readNpm.py
```
# it has to contain value in
record["collected"]["metadata"]["repository"]["url"]
"git+https://github.com//0-.git"
zcat /data/NPMvulnerabilities/NPMpkglist/NPMpkglist_XX.gz | python3 readNpm.py
```
2. For each such package, get a list of all releases. Use Github API:
Please keep in mind that /data/NPMvulnerabilities/ is not on gcloud, only
on da2, so please run it on da2 or copy NPMpkglist_XX.gz to gcloud.

2. Identify the packages that have GH repos (based on the stored info)
```
import pymongo, json, sys
client = pymongo.MongoClient ()
db = client ['fdac18mp2']
id = sys.argv[1] #your utkid
coll = db [ 'npm_' + id]
for r in coll.find():
if 'collected' in r:
r = r['collected']
if 'metadata' in r:
r = r['metadata']
if 'repository' in r:
r = r['repository']
if 'url' in r:
r = r['url']
print (r)
```
The above code is in extrNpm.py. To output the urls:
```
python3 extrNpm.py > myurls
```

3. For each such package, get a list of all releases. Example file is readGit.py (you can use it with the snippet above to get releases). It reads from standard input and populates
releases_yourutkid collection. Reference to Github API:
```
cat myurls | python3 readGit.py
#or
python3 readGit.py < myurls
```
https://developer.github.com/v3/repos/releases/
4. Extract releases from mongodb
```
3. Find no. of commits between the latest and other releases.
import pymongo, json, sys
client = pymongo.MongoClient (host="da1")
db = client ['fdac18mp2']
id = "audris"
coll = db [ 'releases_' + id]
for r in coll.find():
n = r['name']
if 'values' in r:
for v in r['values']:
if 'tag_name' in v:
print (n+';'+v['tag_name'])
```
The above code is in extrRels.py. To output the releases:
```
python3 extrRels.py > myrels
```


5. Find no. of commits between the latest and other releases.

For example:
E.g. https://api.github.com/repos/webpack-contrib/html-loader/compare/v0.5.4...master or https://api.github.com/repos/git/git/compare/v2.2.0-rc1...v2.2.0-rc2
More resource: https://stackoverflow.com/questions/26925312/github-api-how-to-compare-2-commits (look for comparing the tags in the answer)
Get the data from the JSON response; look for fields like the following to get the number of commits between releases:
```
"status": "ahead",
"ahead_by": 24,
"behind_by": 0,
"total_commits": 24,
```
For example
```
cat myrels | python3 compareRels.py > myrels.cmp
```

| number | GitHub Username | NetID | Name |
|:-:|:-:|:-:|---|
Expand Down
83 changes: 83 additions & 0 deletions cmawhinn_compareRels.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import sys, re, pymongo, json, time
import datetime
from requests.auth import HTTPBasicAuth
import requests
import os

# Last known number of GitHub API calls remaining in the current rate-limit
# window. get() passes it to wait() before each request and refreshes it from
# the response headers afterwards.
gleft = 1500

# MongoDB connection; use pymongo.MongoClient() instead for a local server.
client = pymongo.MongoClient(host="da1.eecs.utk.edu")

# GitHub basic-auth credentials. They may be supplied through the
# GITHUB_LOGIN / GITHUB_PASSWD environment variables so that secrets do not
# have to be committed; the defaults are the values previously hard-coded here.
login = os.environ.get('GITHUB_LOGIN', 'Colsarcol')
passwd = os.environ.get('GITHUB_PASSWD', '')

baseurl = 'https://api.github.com/repos'
# A single Accept header is in effect. An earlier assignment of
# 'application/vnd.github.v3.star+json' was overwritten on the very next line
# and never used, so only the effective value is kept.
headers = {'Accept': 'application/vnd.github.hellcat-preview+json'}

db = client['fdac18mp2']
collName = 'releases_cmawhinn'
coll = db[collName]
def wait(left, threshold=20):
    """Block until at least `threshold` GitHub API calls are available.

    If `left` is already at or above `threshold`, nothing is requested and
    `left` is returned unchanged. Otherwise the rate-limit endpoint is polled
    with the module-level credentials; when the quota is still below the
    threshold and the reset time lies in the future, the function sleeps
    until that reset time and polls again.

    Args:
        left: Number of API calls believed to be remaining.
        threshold: Minimum number of remaining calls required to proceed.

    Returns:
        The most recently observed number of remaining calls.
    """
    while left < threshold:
        resp = requests.get('https://api.github.com/rate_limit', auth=(login, passwd))
        if resp.ok:
            left = int(resp.headers.get('X-RateLimit-Remaining'))
            reset = int(resp.headers.get('x-ratelimit-reset'))
            dif = reset - int(time.time())
            if dif > 0 and left < threshold:
                # `left` is a request count, not a duration, so it is labelled
                # as such in the message.
                sys.stderr.write(
                    "waiting for " + str(dif) + "s until rate limit resets ("
                    + str(left) + " requests left)\n"
                )
                time.sleep(dif)
        # Short pause between polls, also when the poll itself failed.
        time.sleep(0.5)
    return left

def get(url):
    """Fetch `url` from the GitHub API and return its decoded JSON body.

    Before the request, wait() is called with the global `gleft` so the call
    is delayed while the rate limit is nearly exhausted. The request is sent
    with the module-level `headers`, `login` and `passwd`. When the response
    is OK and reports its remaining quota, `gleft` is updated from it.

    Args:
        url: Full URL to request.

    Returns:
        The JSON-decoded response body (decoded for non-OK responses too), or
        an empty list when the request itself could not be performed.

    Raises:
        ValueError: If the response body is not valid JSON.
    """
    global gleft
    gleft = wait(gleft)
    try:
        r = requests.get(url, headers=headers, auth=(login, passwd))
    except Exception as e:
        sys.stderr.write("Could not get:" + url + ". Exception:" + str(e) + "\n")
        # No response object exists in this case; return an empty result
        # instead of failing later on an unbound name.
        return []
    time.sleep(0.5)
    if r.ok:
        remaining = r.headers.get('X-RateLimit-Remaining')
        if remaining is not None:
            gleft = int(remaining)
    return json.loads(r.text)

def chunks(l, n):
    """Split the sequence `l` into consecutive slices of at most `n` items.

    A chunk size below 1 is treated as 1. The final slice may be shorter
    than the others.
    """
    step = 1 if n < 1 else n
    pieces = []
    for start in range(0, len(l), step):
        pieces.append(l[start:start + step])
    return pieces

def cmp_rel(url):
    """Print the ahead/behind commit counts reported for a compare URL.

    The URL is fetched with get(). When the decoded response is a mapping
    containing both 'ahead_by' and 'behind_by', one line
    "<url>;<ahead_by>;<behind_by>" is written to standard output. Otherwise a
    diagnostic is written to standard error.

    Args:
        url: GitHub API compare URL for two releases.
    """
    v = []
    try:
        v = get(url)
    except Exception as e:
        sys.stderr.write("Could not get:" + url + ". Exception:" + str(e) + "\n")

    # Only a mapping can carry the expected fields; a list or string body
    # would otherwise give a misleading membership test or fail on indexing.
    if isinstance(v, dict) and 'ahead_by' in v and 'behind_by' in v:
        print(url + ';' + str(v['ahead_by']) + ';' + str(v['behind_by']))
        return

    # Report the reason carried by the response when there is one, instead of
    # asserting a specific cause. NOTE(review): GitHub error bodies are
    # expected to carry a 'message' field - confirm against the API docs.
    reason = v.get('message') if isinstance(v, dict) else None
    if reason is None:
        reason = "response has no 'ahead_by'/'behind_by' fields"
    sys.stderr.write("Could not compare releases for: " + url + "; " + str(reason) + "\n")


# Group release tags by project, preserving input order. Each standard-input
# line is expected to be "<project>;<tag>"; the project part is inserted into
# the compare URL below, so presumably it is an "owner/repo" path.
p2r = {}
for l in sys.stdin:
    l = l.rstrip()
    # Split on the first ';' only so that a tag containing ';' stays intact.
    p, sep, r = l.partition(';')
    if not sep or not p or not r:
        # Blank lines are skipped silently; other malformed lines are reported.
        if l:
            sys.stderr.write("Skipping malformed line: " + l + "\n")
        continue
    p2r.setdefault(p, []).append(r)

# Compare every pair of adjacent tags of each project, in input order.
# Projects with a single tag yield no pairs.
for p, rs in p2r.items():
    for base, head in zip(rs, rs[1:]):
        url = baseurl + '/' + p + '/compare/' + base + '...' + head
        cmp_rel(url)

15 changes: 15 additions & 0 deletions cmawhinn_extrNpm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import pymongo, json, sys
# Print the repository URL stored for every npm package record in the
# 'npm_<id>' collection, one URL per line.
client = pymongo.MongoClient(host="da1")
db = client['fdac18mp2']
id = "cmawhinn"
coll = db['npm_' + id]
for r in coll.find():
    node = r
    # Walk record -> collected -> metadata -> repository -> url, giving up as
    # soon as a level is missing. Each level must be a mapping: on a string
    # the `in` test would be a substring match, and on None it would raise.
    for key in ('collected', 'metadata', 'repository', 'url'):
        if not isinstance(node, dict) or key not in node:
            break
        node = node[key]
    else:
        # Skip empty or null URLs so the output contains only usable values.
        if node:
            print(node)
11 changes: 11 additions & 0 deletions cmawhinn_extrRels.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import pymongo, json, sys
# Print "<name>;<tag_name>" for every stored release that has a tag name,
# reading from the 'releases_<id>' collection.
client = pymongo.MongoClient(host="da1")
db = client['fdac18mp2']
id = "cmawhinn"
coll = db['releases_' + id]
for release_doc in coll.find():
    project_name = release_doc['name']
    if 'values' not in release_doc:
        continue
    for release in release_doc['values']:
        if 'tag_name' not in release:
            continue
        print(project_name + ';' + release['tag_name'])
Loading