Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 150 additions & 0 deletions lpassare.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"import sys\n",
"import re\n",
"import pymongo\n",
"import json\n",
"import time\n",
"import datetime\n",
"import requests\n",
"\n",
"dbname = \"fdac18mp2\" #please use this database\n",
"collname = \"glprj_lpassare\" #please modify so you store data in your collection\n",
"# beginning page index\n",
"begin = \"0\"\n",
"client = pymongo.MongoClient()\n",
"\n",
"db = client[dbname]\n",
"coll = db[collname]\n",
"letter=\"b\"\n",
"\n",
"beginurl = \"https://gitlab.com/api/v4/projects?archived=false&membership=false&order_by=created_at&owned=false&page=\" + begin + \\\n",
" \"&per_page=99&simple=false&sort=desc&starred=false&statistics=false&with_custom_attributes=false&with_issues_enabled=false&with_merge_requests_enabled=false\"\n",
"\n",
"\n",
"gleft = 0\n",
"\n",
"header = {'per_page': 99}\n",
"\n",
"# check remaining query chances for rate-limit restriction\n",
"def wait(left):\n",
" global header\n",
" while (left < 20):\n",
" l = requests.get('https://gitlab.com/api/v4/projects', headers=header)\n",
" if (l.ok):\n",
" left = int(l.headers.get('RateLimit-Remaining'))\n",
" time .sleep(60)\n",
" return left\n",
"\n",
"# send queries and extract urls \n",
"def get(url, coll):\n",
"\n",
" global gleft\n",
" global header\n",
" global bginnum\n",
" gleft = wait(gleft)\n",
" values = []\n",
" size = 0\n",
"\n",
" try:\n",
" r = requests .get(url, headers=header)\n",
" time .sleep(0.5)\n",
" # got blocked\n",
" if r.status_code == 403:\n",
" return \"got blocked\", str(bginnum)\n",
" if (r.ok):\n",
"\n",
" gleft = int(r.headers.get('RateLimit-Remaining'))\n",
" lll = r.headers.get('Link')\n",
" t = r.text\n",
" array = json.loads(t)\n",
" \n",
" for el in array:\n",
" if el['name'].lower().startswith(letter):\n",
" el['site'] = \"git\"\n",
" count += 1\n",
" coll.insert_one(el)\n",
" if count > 49:\n",
" return\n",
" \n",
" \n",
" #next page\n",
" while ('; rel=\"next\"' in lll):\n",
" gleft = int(r.headers.get('RateLimit-Remaining'))\n",
" gleft = wait(gleft)\n",
" # extract next page url\n",
" ll = lll.replace(';', ',').split(',')\n",
" url = ll[ll.index(' rel=\"next\"') -\n",
" 1].replace('<', '').replace('>', '').lstrip()\n",
" \n",
" try:\n",
" r = requests .get(url, headers=header)\n",
" if r.status_code == 403:\n",
" return \"got blocked\", str(bginnum)\n",
" if (r.ok):\n",
" lll = r.headers.get('Link')\n",
" t = r.text\n",
" array1 = json.loads(t)\n",
" for el in array1:\n",
" coll.insert(el)\n",
" else:\n",
" sys.stderr.write(\"url can not found:\\n\" + url + '\\n')\n",
" return \n",
" except requests.exceptions.ConnectionError:\n",
" sys.stderr.write('could not get ' + url + '\\n')\n",
"\n",
" else:\n",
" sys.stderr.write(\"url can not found:\\n\" + url + '\\n')\n",
" return\n",
"\n",
" except requests.exceptions.ConnectionError:\n",
" sys.stderr.write('could not get ' + url + '\\n')\n",
" except Exception as e:\n",
" sys.stderr.write(url + ';' + str(e) + '\\n')\n",
" \n",
"#start retrieving \n",
"get(beginurl,coll)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import gitlab \n",
"gl = gitlab.Gitlab('http://10.0.0.1')\n",
"gl.search('projects','b',per_page=50)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}