From 3bcd7b544431b702755a6a12c83d205311ca417a Mon Sep 17 00:00:00 2001 From: siddharm Date: Wed, 19 Aug 2020 18:36:26 -0400 Subject: [PATCH] updated file names to remove space --- ASCL_language_statistics.ipynb | 221 +++++++++ Language-Date_visualizations.ipynb | 707 +++++++++++++++++++++++++++++ 2 files changed, 928 insertions(+) create mode 100644 ASCL_language_statistics.ipynb create mode 100644 Language-Date_visualizations.ipynb diff --git a/ASCL_language_statistics.ipynb b/ASCL_language_statistics.ipynb new file mode 100644 index 0000000..e6b83ef --- /dev/null +++ b/ASCL_language_statistics.ipynb @@ -0,0 +1,221 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "#testing out opening a file\n", + "import re\n", + "\n", + "with open('ascl_github_repos') as fp:\n", + " for line in fp:\n", + " match = re.match(\"^.*github.com/(.*)/(.*)$\", line)\n", + " if match:\n", + " author = match.group(1)\n", + " repo = match.group(2)\n", + " #print(\"author: \" + author + \" repo: \" + repo)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "{'C++': 219978, 'Jupyter Notebook': 46397, 'Makefile': 2142}\n", + "\n", + "\n", + "\n", + " Repo Author Language Bytes\n", + "0 Eclairs 0satoken C++ 219978\n", + "1 Eclairs 0satoken Jupyter Notebook 46397\n", + "2 Eclairs 0satoken Makefile 2142\n" + ] + } + ], + "source": [ + "#testing out a GET request to the GitHub API and adding it to a DataFrame\n", + "import requests\n", + "import github_config\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "df = pd.DataFrame(columns=['Repo', 'Author', 'Language', 'Bytes'])\n", + "\n", + "session = requests.Session()\n", + "session.auth = (github_config.username, github_config.password)\n", + "\n", + "http_base = 'https://api.github.com/repos/'\n", + "r = 
session.get(http_base+'0satoken/Eclairs'+'/languages')\n", + "print(r)\n", + "json = r.json()\n", + "\n", + "print(json)\n", + "\n", + "for key in json:\n", + " df.loc[0 if pd.isnull(df.index.max()) else df.index.max() + 1] = ['Eclairs', '0satoken', key, json[key]]\n", + " \n", + " '''\n", + " df2 = pd.DataFrame({\n", + " 'Repo':'Eclairs',\n", + " 'Author':'0satoken',\n", + " 'Language':i,\n", + " 'Bytes':[json[i]]\n", + " })\n", + " df = df.append(df2)\n", + " '''\n", + "print(\"\\n\\n\")\n", + "print(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Repo Author Language Bytes\n", + "0 Eclairs 0satoken C++ 219978\n", + "1 Eclairs 0satoken Jupyter Notebook 46397\n", + "2 Eclairs 0satoken Makefile 2142\n", + "3 Crab.Toolkit.michi2 1054 Python 1199268\n", + "4 Crab.Toolkit.michi2 1054 Shell 862626\n", + "5 Crab.Toolkit.michi2 1054 Fortran 458353\n", + "6 Crab.Toolkit.michi2 1054 Prolog 173935\n", + "7 Crab.Toolkit.michi2 1054 IDL 107971\n", + "8 Crab.Toolkit.michi2 1054 Makefile 2800\n", + "9 PRISM 1313e Python 910299\n", + "10 PRISM 1313e Jupyter Notebook 78845\n", + "11 PRISM 1313e TeX 6351\n", + "12 exostriker 3fon3fonov Python 5826713\n", + "13 exostriker 3fon3fonov Fortran 1747617\n", + "14 exostriker 3fon3fonov Jupyter Notebook 75609\n", + "15 exostriker 3fon3fonov Assembly 20621\n", + "16 exostriker 3fon3fonov C 6542\n", + "17 exostriker 3fon3fonov Shell 5688\n", + "18 exostriker 3fon3fonov C++ 2253\n", + "19 exostriker 3fon3fonov MATLAB 1752\n", + "20 exostriker 3fon3fonov Pascal 1492\n", + "21 fourpisky-core 4pisky HTML 4890745\n", + "22 fourpisky-core 4pisky Python 203440\n", + "23 fourpisky-core 4pisky Jupyter Notebook 13027\n", + "24 fourpisky-core 4pisky Shell 743\n", + "25 AskaryanModule 918particle C++ 47462\n", + "26 AskaryanModule 918particle MATLAB 14572\n", + "27 AskaryanModule 918particle Gnuplot 4028\n", + "28 AskaryanModule 
918particle C 1053\n", + "29 AskaryanModule 918particle M 802\n", + "... ... ... ... ...\n", + "4003 OpenMHD zenitani Gnuplot 694\n", + "4004 OpenMHD zenitani Shell 170\n", + "4005 TAP zgazak IDL 346739\n", + "4006 TAP zgazak Prolog 24400\n", + "4007 pyreaclib zingale Fortran 129292\n", + "4008 pyreaclib zingale Jupyter Notebook 82591\n", + "4009 pyreaclib zingale Python 72679\n", + "4010 pyreaclib zingale Makefile 1248\n", + "4011 dacapo_calibration ziotom78 Python 90488\n", + "4012 dacapo_calibration ziotom78 Fortran 6825\n", + "4013 dacapo_calibration ziotom78 Makefile 5039\n", + "4014 dacapo_calibration ziotom78 TeX 4898\n", + "4015 dacapo_calibration ziotom78 MATLAB 3042\n", + "4016 dacapo_calibration ziotom78 Dockerfile 653\n", + "4017 polycomp ziotom78 Python 116573\n", + "4018 megalib zoglauer GLSL 13138267\n", + "4019 megalib zoglauer C++ 9821809\n", + "4020 megalib zoglauer Shell 438070\n", + "4021 megalib zoglauer Makefile 93092\n", + "4022 megalib zoglauer C 28767\n", + "4023 megalib zoglauer Python 9763\n", + "4024 megalib zoglauer Dockerfile 2306\n", + "4025 megalib zoglauer Objective-C 301\n", + "4026 dst zonca Python 29300\n", + "4027 python-qucs zonca Python 8313\n", + "4028 starpy zooniverse Python 53011\n", + "4029 CausticFrog zpenoyre Python 21851\n", + "4030 CausticFrog zpenoyre Jupyter Notebook 3453\n", + "4031 OoT zpenoyre Jupyter Notebook 68132\n", + "4032 OoT zpenoyre Python 9640\n", + "\n", + "[4033 rows x 4 columns]\n" + ] + } + ], + "source": [ + "#The real deal\n", + "import re\n", + "import requests\n", + "import github_config\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "session = requests.Session()\n", + "session.auth = (github_config.username, github_config.password)\n", + "\n", + "http_base = 'https://api.github.com/repos/'\n", + "\n", + "#placeholder column names\n", + "df = pd.DataFrame(columns=['Repo', 'Author', 'Language', 'Bytes'])\n", + "\n", + "with open('ascl_github_repos') as fp:\n", + " for line in 
fp:\n", + " #if it matches the github repo URL scheme \n", + " # (weeds out malformed duplicates)\n", + " match = re.match(\"^.*github.com/(.*)/(.*)$\", line)\n", + " if match:\n", + " author = match.group(1)\n", + " repo = match.group(2)\n", + " \n", + " #hit the GitHub API\n", + " r = session.get(http_base+author+'/'+repo+'/languages')\n", + " #an error reply (e.g. 404) is a JSON object too; iterating it would add bogus rows\n", + " languages = r.json() if r.ok else {}\n", + " for key in languages:\n", + " #adds each language to the end of the dataframe\n", + " df.loc[0 if pd.isnull(df.index.max()) else df.index.max() + 1] = [repo, author, key, languages[key]]\n", + "\n", + " \n", + "print(df)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv('languages.csv')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Language-Date_visualizations.ipynb b/Language-Date_visualizations.ipynb new file mode 100644 index 0000000..91792a3 --- /dev/null +++ b/Language-Date_visualizations.ipynb @@ -0,0 +1,707 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 206, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "df = pd.read_csv('language_data_with_dates.csv')\n", + "#codes where df[df['ascl-id'] == 0] need to be dealt with when thinking about dates" + ] + }, + { + "cell_type": "code", + "execution_count": 207, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"\\nfig1, ax1 = plt.subplots()\\nax1.pie(lang[lang.columns[1]], labels=lang[lang.columns[0]], 
autopct='%1.1f%%',\\n shadow=True, startangle=90)\\nax1.axis('equal')\\n\"" + ] + }, + "execution_count": 207, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#A pie chart of the total language stats\n", + "\n", + "#cast the Bytes column to int64\n", + "df = df[pd.to_numeric(df.Bytes,errors='coerce').notnull()]\n", + "df.Bytes = df['Bytes'].astype(str).astype(int)\n", + "df.sort_values(by='Bytes', ascending=False)\n", + "\n", + "#group the rows by languages and sum on the Bytes column\n", + "lang = df.groupby('Language', as_index=False)['Bytes'].sum()\n", + "\n", + "#Sed takes up a lot because one repository was\n", + "#using the file extension for their own data, not code\n", + "#Jupyter notebooks can be excluded, since they may contain \n", + "#images which can artificially inflate the size\n", + "lang = lang[lang['Language'] != \"sed\"]\n", + "lang = lang[lang['Language'] != \"Jupyter Notebook\"]\n", + "\n", + "#create a new column for the percent that language occupies\n", + "lang['% of total'] = lang.Bytes / lang.Bytes.sum() * 100\n", + "\n", + "#filtering out irrelevant data\n", + "lang = lang[lang['% of total'] > 0.001]\n", + "\n", + "lang = lang.sort_values(by='% of total', ascending=False)\n", + "\n", + "#Pie chart time!\n", + "\n", + "#this creates a pie chart without consolidating languages\n", + "'''\n", + "fig1, ax1 = plt.subplots()\n", + "ax1.pie(lang[lang.columns[1]], labels=lang[lang.columns[0]], autopct='%1.1f%%',\n", + " shadow=True, startangle=90)\n", + "ax1.axis('equal')\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": 208, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Language Bytes % of total\n", + "0 Python 289004288 18.163067\n", + "1 Fortran 263411241 16.554619\n", + "2 C 244733012 15.380748\n", + "3 C++ 218327962 13.721268\n", + "4 All other languages, each < 2% of total 191282135 12.021518\n", + "5 HTML 95307284 5.989782\n", + "6 Java 
63113450 3.966494\n", + "7 Shell 54842464 3.446687\n", + "8 Ruby 53948642 3.390513\n", + "9 TeX 44539192 2.799157\n", + "10 OpenEdge ABL 36927794 2.320803\n", + "11 IDL 35571569 2.235568\n" + ] + }, + { + "data": { + "text/plain": [ + "(-1.1125275989249963,\n", + " 1.1061580341722685,\n", + " -1.101081319684025,\n", + " 1.1000514928489695)" + ] + }, + "execution_count": 208, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Now, create a pie chart with languages <2% consolidated into one \"Other Languages\" entry\n", + "\n", + "#calculate percent sum\n", + "#calculate bytes sum\n", + "#filter lang dataframe for > 2%\n", + "#create new dataframe row of sum data and append to filtered dataframe\n", + "\n", + "find_sum = lang[lang['% of total'] < 2].sum()\n", + "\n", + "pct_sum = find_sum['% of total']\n", + "bytes_sum = find_sum['Bytes']\n", + "\n", + "new_row = [\"All other languages, each < 2% of total\",\n", + " bytes_sum,\n", + " pct_sum]\n", + "\n", + "most = lang[lang['% of total'] >= 2].reset_index().drop(columns=['index'])\n", + "most.loc[len(most)] = new_row\n", + "most = most.sort_values(by='% of total', ascending=False).reset_index().drop(columns=['index'])\n", + "print(most)\n", + "\n", + "\n", + "fig2, ax2 = plt.subplots()\n", + "ax2.pie(most[most.columns[1]], labels=most[most.columns[0]], autopct='%1.1f%%',\n", + " shadow=True, startangle=90)\n", + "ax2.axis('equal')" + ] + }, + { + "cell_type": "code", + "execution_count": 209, + "metadata": {}, + "outputs": [], + "source": [ + "#return the year from the first 2 digits of the ascl-id; zfill restores a leading zero lost if the id was parsed as a number (905.001 -> 0905)\n", + "def ascl_year(row):\n", + " year = str(row['ascl-id']).split('.')[0].zfill(4)[0:2]\n", + " try:\n", + " year = int(year)\n", + " except ValueError:\n", + " year = float(year)\n", + " if year > 89:\n", + " return 1900+int(year)\n", + " elif year == 0:\n", + " return None\n", + " else:\n", + " return 2000+int(year)\n", + "\n", + "#now, a function to generate the repo year\n", + "def repo_year(row):\n", + " return row['repo_date'][0:4]\n", + "\n", + "df['ascl_year'] = df.apply (lambda row: ascl_year(row), axis=1)\n", + "df.ascl_year = df.ascl_year.fillna(0.0).astype(int)\n", + "df['repo_year'] = df.apply (lambda row: repo_year(row), axis=1)\n", + "df.repo_year = df.repo_year.fillna(\"0\").astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 
210, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
YearJavaCC++PythonRubyShell
20080.0000000.5780530.0580890.3613690.0000000.002489
20090.0000000.0016070.4176360.5665790.0000000.014178
20100.0000000.0091530.2706740.7161610.0000000.004011
20110.0000000.1391550.0242590.7860910.0000000.050494
20120.0178040.4566890.1401290.3338530.0000000.051525
20130.2401660.3964100.1659450.1679470.0149560.014577
20140.1314480.3736470.1639970.3164730.0000000.014435
20150.0016000.2194180.2097170.2913590.2695030.008403
20160.0323610.2059680.2348690.4896170.0155210.021665
20170.0000000.1375360.4250450.2367150.0000030.200702
20180.0007080.1862190.2086670.5708270.0000050.033575
20190.1928660.1363250.1532220.5136840.0000000.003903
2020NaNNaNNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + "Year Java C C++ Python Ruby Shell\n", + "2008 0.000000 0.578053 0.058089 0.361369 0.000000 0.002489\n", + "2009 0.000000 0.001607 0.417636 0.566579 0.000000 0.014178\n", + "2010 0.000000 0.009153 0.270674 0.716161 0.000000 0.004011\n", + "2011 0.000000 0.139155 0.024259 0.786091 0.000000 0.050494\n", + "2012 0.017804 0.456689 0.140129 0.333853 0.000000 0.051525\n", + "2013 0.240166 0.396410 0.165945 0.167947 0.014956 0.014577\n", + "2014 0.131448 0.373647 0.163997 0.316473 0.000000 0.014435\n", + "2015 0.001600 0.219418 0.209717 0.291359 0.269503 0.008403\n", + "2016 0.032361 0.205968 0.234869 0.489617 0.015521 0.021665\n", + "2017 0.000000 0.137536 0.425045 0.236715 0.000003 0.200702\n", + "2018 0.000708 0.186219 0.208667 0.570827 0.000005 0.033575\n", + "2019 0.192866 0.136325 0.153222 0.513684 0.000000 0.003903\n", + "2020 NaN NaN NaN NaN NaN NaN" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#link to how to get stacked area chart\n", + "#https://python-graph-gallery.com/255-percentage-stacked-area-chart/\n", + "\n", + "#x axis - year\n", + "#y axis - percentage of the total\n", + "#for each language, we need an array of languages over years\n", + "\n", + "#let's only take a look at the most popular languages\n", + "\n", + "#each language is a separate row\n", + "#each column is a year, and each cell represents the bytes of that language in that year\n", + "\n", + "repo_years = np.sort(df.repo_year.unique())\n", + "\n", + "#can use either of these, or define your own subset to get different information\n", + "most_bytes_langs = ['Java', 'C', 'C++', 'Python', 'R', 'Fortran', 'Ruby', 'HTML', 'Shell', 'Others']\n", + "language_subset = ['Java', 'C', 'C++', 'Python', 'Ruby', 'Shell']\n", + "\n", + "requested_langs = language_subset\n", + "repo_df = pd.DataFrame({'Language': requested_langs})\n", + "\n", + "def bytes_that_year(row, year):\n", + " #print('trying ' + row['Language'] + ' in year ' + 
str(year))\n", + " \n", + " if row['Language'] == 'Others':\n", + " #sum up all the non-requested_langs bytes\n", + " return df[ ~(df['Language'].isin(requested_langs)) & (df['repo_year'] == year)]['Bytes'].sum()\n", + " lang = row['Language']\n", + " #sum only the Bytes column; a language absent that year raises KeyError -> 0\n", + " try:\n", + " return df[df['repo_year'] == year].groupby('Language')['Bytes'].sum().loc[lang]\n", + " except KeyError:\n", + " return 0\n", + "\n", + " \n", + "#for each year in repo_years, create a column\n", + "# each cell will be the number of bytes in that year for that language \n", + "\n", + "for year in repo_years:\n", + " #create columns for the total number of bytes\n", + " arr = repo_df.apply (lambda row: bytes_that_year(row, year), axis=1)\n", + " #col_name = str(year) + \" bytes\"\n", + " #repo_df[col_name] = arr\n", + " total_bytes = arr.sum()\n", + " pct_col_name = str(year) #+ \" pct\"\n", + " repo_df[pct_col_name] = arr.divide(total_bytes)\n", + "\n", + "#display(repo_df)\n", + "\n", + "\n", + "#need to get the whole df sideways to make the stackchart easy\n", + "repo_df = repo_df.T\n", + "new_header = repo_df.iloc[0] \n", + "repo_df = repo_df[1:]\n", + "repo_df.columns = new_header\n", + "\n", + "repo_df.columns.name = 'Year'\n", + "repo_df = repo_df.apply(pd.to_numeric, errors='coerce')\n", + "display(repo_df)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 211, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
YearJavaCC++PythonRubyShell
20080.0000000.5780530.0580890.3613690.0000000.002489
20090.0000000.0016340.4246160.5593350.0000000.014415
20100.0000000.0181080.7428810.2352320.0000000.003779
20110.0000000.1327690.0248020.7908380.0000000.051591
20120.0181740.4630170.1359440.3302770.0000000.052587
20130.2452340.3948220.1682730.1615160.0152710.014883
20140.1356500.3842750.1687590.2967890.0000000.014526
20150.0017230.2145730.2251170.2593760.2902720.008939
20160.0336680.2128530.2366410.4782050.0161470.022486
20170.0000000.1373410.4123990.2339960.0000030.216262
20180.0007250.1896090.2137740.5616290.0000050.034258
20190.2556980.1722780.0706550.4989160.0000000.002454
2020NaNNaNNaNNaNNaNNaN
\n", + "\n", + "
" + ], + "text/plain": [ + "Year Java C C++ Python Ruby Shell\n", + "2008 0.000000 0.578053 0.058089 0.361369 0.000000 0.002489\n", + "2009 0.000000 0.001634 0.424616 0.559335 0.000000 0.014415\n", + "2010 0.000000 0.018108 0.742881 0.235232 0.000000 0.003779\n", + "2011 0.000000 0.132769 0.024802 0.790838 0.000000 0.051591\n", + "2012 0.018174 0.463017 0.135944 0.330277 0.000000 0.052587\n", + "2013 0.245234 0.394822 0.168273 0.161516 0.015271 0.014883\n", + "2014 0.135650 0.384275 0.168759 0.296789 0.000000 0.014526\n", + "2015 0.001723 0.214573 0.225117 0.259376 0.290272 0.008939\n", + "2016 0.033668 0.212853 0.236641 0.478205 0.016147 0.022486\n", + "2017 0.000000 0.137341 0.412399 0.233996 0.000003 0.216262\n", + "2018 0.000725 0.189609 0.213774 0.561629 0.000005 0.034258\n", + "2019 0.255698 0.172278 0.070655 0.498916 0.000000 0.002454\n", + "2020 NaN NaN NaN NaN NaN NaN" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#Now, do the same for ascl_year instead of github repo_year\n", + "#drop all the codes where the ascl_year is 0 (they have no ascl-id)\n", + "df = df[df['ascl_year'] != 0]\n", + "\n", + "ascl_years = np.sort(df.ascl_year.unique())\n", + "\n", + "ascl_df = pd.DataFrame({'Language': requested_langs})\n", + "\n", + "\n", + "def bytes_that_year_ascl(row, year):\n", + " #bytes of row['Language'] among the codes whose ascl_year equals year\n", + " \n", + " if row['Language'] == 'Others':\n", + " #sum up all the non-requested_langs bytes\n", + " return df[ ~(df['Language'].isin(requested_langs)) & (df['ascl_year'] == year)]['Bytes'].sum()\n", + " lang = row['Language']\n", + " #filter on ascl_year (not repo_year); a language absent that year raises KeyError -> 0\n", + " try:\n", + " return df[df['ascl_year'] == year].groupby('Language')['Bytes'].sum().loc[lang]\n", + " except KeyError:\n", + " return 0\n", + "\n", + "for year in ascl_years:\n", + " #create columns for the total number of bytes\n", + " arr = ascl_df.apply (lambda row: bytes_that_year_ascl(row, year), axis=1)\n", + " total_bytes = arr.sum()\n", + " 
pct_col_name = str(year) #+ \" pct\"\n", + " ascl_df[pct_col_name] = arr.divide(total_bytes)\n", + "\n", + "ascl_df = ascl_df.T\n", + "new_header = ascl_df.iloc[0] \n", + "ascl_df = ascl_df[1:]\n", + "ascl_df.columns = new_header\n", + "\n", + "ascl_df.columns.name = 'Year'\n", + "ascl_df = ascl_df.apply(pd.to_numeric, errors='coerce')\n", + "display(ascl_df)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 212, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "#now, let's create the stacked area chart\n", + "\n", + "plt.stackplot( repo_df.index.values, repo_df.T )\n", + "plt.legend(bbox_to_anchor=(1.25, 1), labels=repo_df.columns)\n", + "plt.margins(0,0)\n", + "plt.title('ASCL Language Data Over the Years (Github Years)')\n", + "plt.xlabel('Year')\n", + "plt.ylabel('% of total language use')\n", + "plt.show()\n", + "\n", + "\n", + "plt.stackplot( ascl_df.index.values, ascl_df.T )\n", + "plt.legend(bbox_to_anchor=(1.25, 1), labels=ascl_df.columns)\n", + "plt.margins(0,0)\n", + "plt.title('ASCL Language Data Over the Years (ASCL Years)')\n", + "plt.xlabel('Year')\n", + "plt.ylabel('% of total language use')\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}