-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
GitHub API scraping for language data
- Loading branch information
Showing
2 changed files
with
4,255 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,221 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 14, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#testing out opening a file\n", | ||
"import re\n", | ||
"\n", | ||
"with open('ascl_github_repos') as fp:\n", | ||
" for line in fp:\n", | ||
" match = re.match(\"^.*github.com/(.*)/(.*)$\", line)\n", | ||
" if match:\n", | ||
" author = match.group(1)\n", | ||
" repo = match.group(2)\n", | ||
" #print(\"author: \" + author + \" repo: \" + repo)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"<Response [200]>\n", | ||
"{'C++': 219978, 'Jupyter Notebook': 46397, 'Makefile': 2142}\n", | ||
"\n", | ||
"\n", | ||
"\n", | ||
" Repo Author Language Bytes\n", | ||
"0 Eclairs 0satoken C++ 219978\n", | ||
"1 Eclairs 0satoken Jupyter Notebook 46397\n", | ||
"2 Eclairs 0satoken Makefile 2142\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"#testing out a GET request to the GitHub API and adding it to a DataFrame\n", | ||
"import requests\n", | ||
"import github_config\n", | ||
"import numpy as np\n", | ||
"import pandas as pd\n", | ||
"\n", | ||
"df = pd.DataFrame(columns=['Repo', 'Author', 'Language', 'Bytes'])\n", | ||
"\n", | ||
"session = requests.Session()\n", | ||
"session.auth = (github_config.username, github_config.password)\n", | ||
"\n", | ||
"http_base = 'https://api.github.com/repos/'\n", | ||
"r = session.get(http_base+'0satoken/Eclairs'+'/languages')\n", | ||
"print(r)\n", | ||
"json = r.json()\n", | ||
"\n", | ||
"print(json)\n", | ||
"\n", | ||
"for key in json:\n", | ||
" df.loc[0 if pd.isnull(df.index.max()) else df.index.max() + 1] = ['Eclairs', '0satoken', key, json[key]]\n", | ||
" \n", | ||
" '''\n", | ||
" df2 = pd.DataFrame({\n", | ||
" 'Repo':'Eclairs',\n", | ||
" 'Author':'0satoken',\n", | ||
" 'Language':i,\n", | ||
" 'Bytes':[json[i]]\n", | ||
" })\n", | ||
" df = df.append(df2)\n", | ||
" '''\n", | ||
"print(\"\\n\\n\")\n", | ||
"print(df)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
" Repo Author Language Bytes\n", | ||
"0 Eclairs 0satoken C++ 219978\n", | ||
"1 Eclairs 0satoken Jupyter Notebook 46397\n", | ||
"2 Eclairs 0satoken Makefile 2142\n", | ||
"3 Crab.Toolkit.michi2 1054 Python 1199268\n", | ||
"4 Crab.Toolkit.michi2 1054 Shell 862626\n", | ||
"5 Crab.Toolkit.michi2 1054 Fortran 458353\n", | ||
"6 Crab.Toolkit.michi2 1054 Prolog 173935\n", | ||
"7 Crab.Toolkit.michi2 1054 IDL 107971\n", | ||
"8 Crab.Toolkit.michi2 1054 Makefile 2800\n", | ||
"9 PRISM 1313e Python 910299\n", | ||
"10 PRISM 1313e Jupyter Notebook 78845\n", | ||
"11 PRISM 1313e TeX 6351\n", | ||
"12 exostriker 3fon3fonov Python 5826713\n", | ||
"13 exostriker 3fon3fonov Fortran 1747617\n", | ||
"14 exostriker 3fon3fonov Jupyter Notebook 75609\n", | ||
"15 exostriker 3fon3fonov Assembly 20621\n", | ||
"16 exostriker 3fon3fonov C 6542\n", | ||
"17 exostriker 3fon3fonov Shell 5688\n", | ||
"18 exostriker 3fon3fonov C++ 2253\n", | ||
"19 exostriker 3fon3fonov MATLAB 1752\n", | ||
"20 exostriker 3fon3fonov Pascal 1492\n", | ||
"21 fourpisky-core 4pisky HTML 4890745\n", | ||
"22 fourpisky-core 4pisky Python 203440\n", | ||
"23 fourpisky-core 4pisky Jupyter Notebook 13027\n", | ||
"24 fourpisky-core 4pisky Shell 743\n", | ||
"25 AskaryanModule 918particle C++ 47462\n", | ||
"26 AskaryanModule 918particle MATLAB 14572\n", | ||
"27 AskaryanModule 918particle Gnuplot 4028\n", | ||
"28 AskaryanModule 918particle C 1053\n", | ||
"29 AskaryanModule 918particle M 802\n", | ||
"... ... ... ... ...\n", | ||
"4003 OpenMHD zenitani Gnuplot 694\n", | ||
"4004 OpenMHD zenitani Shell 170\n", | ||
"4005 TAP zgazak IDL 346739\n", | ||
"4006 TAP zgazak Prolog 24400\n", | ||
"4007 pyreaclib zingale Fortran 129292\n", | ||
"4008 pyreaclib zingale Jupyter Notebook 82591\n", | ||
"4009 pyreaclib zingale Python 72679\n", | ||
"4010 pyreaclib zingale Makefile 1248\n", | ||
"4011 dacapo_calibration ziotom78 Python 90488\n", | ||
"4012 dacapo_calibration ziotom78 Fortran 6825\n", | ||
"4013 dacapo_calibration ziotom78 Makefile 5039\n", | ||
"4014 dacapo_calibration ziotom78 TeX 4898\n", | ||
"4015 dacapo_calibration ziotom78 MATLAB 3042\n", | ||
"4016 dacapo_calibration ziotom78 Dockerfile 653\n", | ||
"4017 polycomp ziotom78 Python 116573\n", | ||
"4018 megalib zoglauer GLSL 13138267\n", | ||
"4019 megalib zoglauer C++ 9821809\n", | ||
"4020 megalib zoglauer Shell 438070\n", | ||
"4021 megalib zoglauer Makefile 93092\n", | ||
"4022 megalib zoglauer C 28767\n", | ||
"4023 megalib zoglauer Python 9763\n", | ||
"4024 megalib zoglauer Dockerfile 2306\n", | ||
"4025 megalib zoglauer Objective-C 301\n", | ||
"4026 dst zonca Python 29300\n", | ||
"4027 python-qucs zonca Python 8313\n", | ||
"4028 starpy zooniverse Python 53011\n", | ||
"4029 CausticFrog zpenoyre Python 21851\n", | ||
"4030 CausticFrog zpenoyre Jupyter Notebook 3453\n", | ||
"4031 OoT zpenoyre Jupyter Notebook 68132\n", | ||
"4032 OoT zpenoyre Python 9640\n", | ||
"\n", | ||
"[4033 rows x 4 columns]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"#The real deal\n", | ||
"import re\n", | ||
"import requests\n", | ||
"import github_config\n", | ||
"import numpy as np\n", | ||
"import pandas as pd\n", | ||
"\n", | ||
"session = requests.Session()\n", | ||
"session.auth = (github_config.username, github_config.password)\n", | ||
"\n", | ||
"http_base = 'https://api.github.com/repos/'\n", | ||
"\n", | ||
"#placeholder column names\n", | ||
"df = pd.DataFrame(columns=['Repo', 'Author', 'Language', 'Bytes'])\n", | ||
"\n", | ||
"with open('ascl_github_repos') as fp:\n", | ||
" for line in fp:\n", | ||
" #if it matches the github repo URL scheme \n", | ||
" # (weeds out malformed duplicates)\n", | ||
" match = re.match(\"^.*github.com/(.*)/(.*)$\", line)\n", | ||
" if match:\n", | ||
" author = match.group(1)\n", | ||
" repo = match.group(2)\n", | ||
" \n", | ||
" #hit the GitHub API\n", | ||
" r = session.get(http_base+author+'/'+repo+'/languages')\n", | ||
" languages = r.json()\n", | ||
" \n", | ||
" for key in languages:\n", | ||
" #adds each language to the end of the dataframe\n", | ||
" df.loc[0 if pd.isnull(df.index.max()) else df.index.max() + 1] = [repo, author, key, languages[key]]\n", | ||
"\n", | ||
" \n", | ||
"print(df)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"df.to_csv('languages.csv')" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.7" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.