Commit

files added
lijoatimaginea committed Nov 10, 2020
0 parents commit b3efdac
Showing 104 changed files with 2,937,302 additions and 0 deletions.
20 changes: 20 additions & 0 deletions README.md
@@ -0,0 +1,20 @@
## Different projects/use cases leveraging Apache Spark

- [Building a recommendation system with Spark ML and Elasticsearch](https://gitlab.pramati.com/lijoa/spark_playground/-/tree/master/recommendation_system_spark_es)
- [Exploratory data analysis of Covid-19 data using Apache Spark and Python](https://gitlab.pramati.com/lijoa/spark_playground/-/tree/master/covid19_Analysis)
- [Heart disease prediction using Apache Spark ML - Binary Classification](https://gitlab.pramati.com/lijoa/spark_playground/-/tree/master/heart_disease_classification)
- [Mutual fund analysis and ranking](https://gitlab.pramati.com/lijoa/spark_playground/-/tree/master/mutual_fund_data_analysis)
- [News Article Classification](https://gitlab.pramati.com/lijoa/spark_playground/-/tree/master/news_aggregator)
- [Titanic Survivor prediction](https://gitlab.pramati.com/lijoa/spark_playground/-/tree/master/titanic_analysis)

## My blogs about Spark and ML

- [Medium](https://medium.com/@lijoabraham1234)
- [Imaginea](https://blog.imaginea.com/author/lijo-abraham/)


## To run the Jupyter notebooks

```
cd <notebook_directory>   # e.g. covid19_Analysis
jupyter notebook
```
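
If PySpark and the plotting libraries are not already installed, a minimal setup sketch (assuming pip and a Python 3 environment; these are the standard PyPI package names) is:

```
pip install pyspark jupyter seaborn matplotlib
```
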
@@ -0,0 +1,235 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Covid19 India Data Analysis using Spark"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-05T10:36:14.213656Z",
"start_time": "2020-08-05T10:36:14.179810Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <div>\n",
" <p><b>SparkSession - in-memory</b></p>\n",
" \n",
" <div>\n",
" <p><b>SparkContext</b></p>\n",
"\n",
" <p><a href=\"http://192.168.0.49:4042\">Spark UI</a></p>\n",
"\n",
" <dl>\n",
" <dt>Version</dt>\n",
" <dd><code>v3.0.0</code></dd>\n",
" <dt>Master</dt>\n",
" <dd><code>local[*]</code></dd>\n",
" <dt>AppName</dt>\n",
" <dd><code>covid19_India_Data_Analysis</code></dd>\n",
" </dl>\n",
" </div>\n",
" \n",
" </div>\n",
" "
],
"text/plain": [
"<pyspark.sql.session.SparkSession at 0x7fccc95b2048>"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pyspark\n",
"from pyspark.sql import *\n",
"spark = SparkSession.builder.appName(\"covid19_India_Data_Analysis\").getOrCreate()\n",
"spark"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-05T12:08:36.833663Z",
"start_time": "2020-08-05T12:08:36.513877Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------+---------+---------+------+------+\n",
"| State|Confirmed|Recovered|Deaths|Active|\n",
"+-----------+---------+---------+------+------+\n",
"| Total| 1910681| 1282917| 39856|587459|\n",
"|Maharashtra| 457956| 299356| 16142|142151|\n",
"+-----------+---------+---------+------+------+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [
"df = spark.read.csv('./data/state_wise.csv', escape='\"', header=True, \n",
" inferSchema=True, multiLine=True)\n",
"df = df.withColumnRenamed(\"State_Notes\\r\", \"State_Notes\")\n",
"data = df.select(df.columns[:5])\n",
"data.show(2)"
]
},
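{
"cell_type": "markdown",
"metadata": {},
"source": [
"One quick way to sanity-check the loaded data (an illustrative sketch using the Spark DataFrame API): rank the states by active caseload, excluding the aggregate 'Total' row."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import functions as F\n",
"\n",
"# Top 5 states by active cases; assumes the 'Total' summary row\n",
"# should be excluded from the ranking.\n",
"(data.filter(F.col('State') != 'Total')\n",
"     .orderBy(F.col('Active').desc())\n",
"     .show(5))"
]
},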
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-05T11:36:50.690888Z",
"start_time": "2020-08-05T11:36:50.682836Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- State: string (nullable = true)\n",
" |-- Confirmed: integer (nullable = true)\n",
" |-- Recovered: integer (nullable = true)\n",
" |-- Deaths: integer (nullable = true)\n",
" |-- Active: integer (nullable = true)\n",
" |-- Last_Updated_Time: string (nullable = true)\n",
" |-- Migrated_Other: integer (nullable = true)\n",
" |-- State_code: string (nullable = true)\n",
" |-- Delta_Confirmed: integer (nullable = true)\n",
" |-- Delta_Recovered: integer (nullable = true)\n",
" |-- Delta_Deaths: integer (nullable = true)\n",
" |-- State_Notes: string (nullable = true)\n",
"\n"
]
}
],
"source": [
"df.printSchema()"
]
},
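{
"cell_type": "markdown",
"metadata": {},
"source": [
"The schema shows that Last_Updated_Time was inferred as a plain string. A sketch of parsing it into a proper timestamp follows; the format 'dd/MM/yyyy HH:mm:ss' is an assumption about state_wise.csv and should be adjusted if the file differs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import functions as F\n",
"\n",
"# Assumed timestamp format; adjust if the CSV uses another layout.\n",
"df_ts = df.withColumn('Last_Updated',\n",
"                      F.to_timestamp('Last_Updated_Time', 'dd/MM/yyyy HH:mm:ss'))\n",
"df_ts.select('State', 'Last_Updated').show(3, truncate=False)"
]
},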
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"start_time": "2020-08-05T12:17:58.339Z"
}
},
"outputs": [],
"source": [
"import seaborn as sns\n",
"sns.set_context('talk')\n",
"\n",
"\n",
"fig, ax = plt.subplots(figsize=(12, 8))\n",
"\n",
"# Our x-axis. We basically just want a list\n",
"# of numbers from zero with a value for each\n",
"# of our jobs.\n",
"x = np.arange(len(df.job.unique()))\n",
"\n",
"# Define bar width. We need this to offset the second bar.\n",
"bar_width = 0.4\n",
"\n",
"b1 = ax.bar(x, df['Confirmed'], 'count'],\n",
" width=bar_width, label='Men')\n",
"# Same thing, but offset the x.\n",
"b2 = ax.bar(x + bar_width, df.loc[df['sex'] == 'women', 'count'],\n",
" width=bar_width, label='Women')\n",
"\n",
"# Fix the x-axes.\n",
"ax.set_xticks(x + bar_width / 2)\n",
"ax.set_xticklabels(df.job.unique())\n",
"\n",
"# Add legend.\n",
"ax.legend()\n",
"\n",
"# Axis styling.\n",
"ax.spines['top'].set_visible(False)\n",
"ax.spines['right'].set_visible(False)\n",
"ax.spines['left'].set_visible(False)\n",
"ax.spines['bottom'].set_color('#DDDDDD')\n",
"ax.tick_params(bottom=False, left=False)\n",
"ax.set_axisbelow(True)\n",
"ax.yaxis.grid(True, color='#EEEEEE')\n",
"ax.xaxis.grid(False)\n",
"\n",
"# Add axis and chart labels.\n",
"ax.set_xlabel('Job', labelpad=15)\n",
"ax.set_ylabel('# Employed', labelpad=15)\n",
"ax.set_title('Employed Workers by Gender for Select Jobs', pad=15)\n",
"\n",
"fig.tight_layout()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}
5 changes: 5 additions & 0 deletions covid19_Analysis/README.md
@@ -0,0 +1,5 @@
# Exploratory data analysis of Covid-19 data using Apache Spark and Python

## The full analysis and explanation can be found in the Medium and Imaginea blog posts:
- [Medium link](https://medium.com/@lijoabraham1234/exploratory-data-analysis-of-covid-19-data-using-apache-spark-and-python-c62300d67595)
- [Imaginea link](https://blog.imaginea.com/exploratory-data-analysis-of-covid-19-data-using-apache-spark-and-python/)