forked from lijoabraham-zz/spark-playground
Commit b3efdac (0 parents), showing 104 changed files with 2,937,302 additions and 0 deletions.
@@ -0,0 +1,20 @@
## Different Projects/Usecases leveraging Apache Spark

- [Building a recommendation system with Spark ML and Elasticsearch](https://gitlab.pramati.com/lijoa/spark_playground/-/tree/master/recommendation_system_spark_es)
- [Exploratory data analysis of Covid-19 data using Apache Spark and Python](https://gitlab.pramati.com/lijoa/spark_playground/-/tree/master/covid19_Analysis)
- [Heart disease prediction using Apache Spark ML - Binary Classification](https://gitlab.pramati.com/lijoa/spark_playground/-/tree/master/heart_disease_classification)
- [Mutual fund analysis and ranking](https://gitlab.pramati.com/lijoa/spark_playground/-/tree/master/mutual_fund_data_analysis)
- [News Article Classification](https://gitlab.pramati.com/lijoa/spark_playground/-/tree/master/news_aggregator)
- [Titanic Survivor prediction](https://gitlab.pramati.com/lijoa/spark_playground/-/tree/master/titanic_analysis)

## My Blogs about Spark and ML

- [Medium](https://medium.com/@lijoabraham1234)
- [Imaginea](https://blog.imaginea.com/author/lijo-abraham/)

## To run a Jupyter notebook

```
cd <notebook_directory>
jupyter notebook
```
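The notebooks rely on PySpark plus the usual Python data stack. The repository doesn't pin its dependencies, so the install line below is an assumption (pip, with package names as published on PyPI; the notebooks themselves report Spark 3.0.0 on Python 3.6.9):

```
pip install pyspark jupyter numpy pandas seaborn matplotlib
```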
235 changes: 235 additions & 0 deletions
covid19_Analysis/.ipynb_checkpoints/covid19India_Data_Analysis-checkpoint.ipynb
@@ -0,0 +1,235 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Covid19 India Data Analysis using Spark"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-05T10:36:14.213656Z",
"start_time": "2020-08-05T10:36:14.179810Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <div>\n",
" <p><b>SparkSession - in-memory</b></p>\n",
" \n",
" <div>\n",
" <p><b>SparkContext</b></p>\n",
"\n",
" <p><a href=\"http://192.168.0.49:4042\">Spark UI</a></p>\n",
"\n",
" <dl>\n",
" <dt>Version</dt>\n",
" <dd><code>v3.0.0</code></dd>\n",
" <dt>Master</dt>\n",
" <dd><code>local[*]</code></dd>\n",
" <dt>AppName</dt>\n",
" <dd><code>covid19_India_Data_Analysis</code></dd>\n",
" </dl>\n",
" </div>\n",
" \n",
" </div>\n",
" "
],
"text/plain": [
"<pyspark.sql.session.SparkSession at 0x7fccc95b2048>"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pyspark\n",
"from pyspark.sql import *\n",
"spark = SparkSession.builder.appName(\"covid19_India_Data_Analysis\").getOrCreate()\n",
"spark"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-05T12:08:36.833663Z",
"start_time": "2020-08-05T12:08:36.513877Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------+---------+---------+------+------+\n",
"| State|Confirmed|Recovered|Deaths|Active|\n",
"+-----------+---------+---------+------+------+\n",
"| Total| 1910681| 1282917| 39856|587459|\n",
"|Maharashtra| 457956| 299356| 16142|142151|\n",
"+-----------+---------+---------+------+------+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [ | ||
"df = spark.read.csv('./data/state_wise.csv', escape='\"', header=True, \n", | ||
" inferSchema=True, multiLine=True)\n", | ||
"df = df.withColumnRenamed(\"State_Notes\\r\", \"State_Notes\")\n", | ||
"data = df.select(df.columns[:5])\n", | ||
"data.show(2)" | ||
] | ||
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-05T11:36:50.690888Z",
"start_time": "2020-08-05T11:36:50.682836Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- State: string (nullable = true)\n",
" |-- Confirmed: integer (nullable = true)\n",
" |-- Recovered: integer (nullable = true)\n",
" |-- Deaths: integer (nullable = true)\n",
" |-- Active: integer (nullable = true)\n",
" |-- Last_Updated_Time: string (nullable = true)\n",
" |-- Migrated_Other: integer (nullable = true)\n",
" |-- State_code: string (nullable = true)\n",
" |-- Delta_Confirmed: integer (nullable = true)\n",
" |-- Delta_Recovered: integer (nullable = true)\n",
" |-- Delta_Deaths: integer (nullable = true)\n",
" |-- State_Notes: string (nullable = true)\n",
"\n"
]
}
],
"source": [
"df.printSchema()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"start_time": "2020-08-05T12:17:58.339Z"
}
},
"outputs": [],
"source": [
"import seaborn as sns\n", | ||
"sns.set_context('talk')\n", | ||
"\n", | ||
"\n", | ||
"fig, ax = plt.subplots(figsize=(12, 8))\n", | ||
"\n", | ||
"# Our x-axis. We basically just want a list\n", | ||
"# of numbers from zero with a value for each\n", | ||
"# of our jobs.\n", | ||
"x = np.arange(len(df.job.unique()))\n", | ||
"\n", | ||
"# Define bar width. We need this to offset the second bar.\n", | ||
"bar_width = 0.4\n", | ||
"\n", | ||
"b1 = ax.bar(x, df['Confirmed'], 'count'],\n", | ||
" width=bar_width, label='Men')\n", | ||
"# Same thing, but offset the x.\n", | ||
"b2 = ax.bar(x + bar_width, df.loc[df['sex'] == 'women', 'count'],\n", | ||
" width=bar_width, label='Women')\n", | ||
"\n", | ||
"# Fix the x-axes.\n", | ||
"ax.set_xticks(x + bar_width / 2)\n", | ||
"ax.set_xticklabels(df.job.unique())\n", | ||
"\n", | ||
"# Add legend.\n", | ||
"ax.legend()\n", | ||
"\n", | ||
"# Axis styling.\n", | ||
"ax.spines['top'].set_visible(False)\n", | ||
"ax.spines['right'].set_visible(False)\n", | ||
"ax.spines['left'].set_visible(False)\n", | ||
"ax.spines['bottom'].set_color('#DDDDDD')\n", | ||
"ax.tick_params(bottom=False, left=False)\n", | ||
"ax.set_axisbelow(True)\n", | ||
"ax.yaxis.grid(True, color='#EEEEEE')\n", | ||
"ax.xaxis.grid(False)\n", | ||
"\n", | ||
"# Add axis and chart labels.\n", | ||
"ax.set_xlabel('Job', labelpad=15)\n", | ||
"ax.set_ylabel('# Employed', labelpad=15)\n", | ||
"ax.set_title('Employed Workers by Gender for Select Jobs', pad=15)\n", | ||
"\n", | ||
"fig.tight_layout()" | ||
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}
@@ -0,0 +1,5 @@
# Exploratory data analysis of Covid-19 data using Apache Spark and Python

## The full analysis and explanation can be found on the Medium or Imaginea blog.
- [Medium link](https://medium.com/@lijoabraham1234/exploratory-data-analysis-of-covid-19-data-using-apache-spark-and-python-c62300d67595)
- [Imaginea link](https://blog.imaginea.com/exploratory-data-analysis-of-covid-19-data-using-apache-spark-and-python/)
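
For a quick look at the kind of analysis the notebook performs without opening it, here is a minimal PySpark sketch. The file path, reader options and column names are taken from the notebook in this commit; the recovery-rate ranking itself is illustrative, not necessarily the exact query from the blog post:

```
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("covid19_India_Data_Analysis").getOrCreate()

# Read the state-wise snapshot; escape/multiLine handle quoted notes fields.
df = spark.read.csv('./data/state_wise.csv', escape='"', header=True,
                    inferSchema=True, multiLine=True)

# Drop the country-level 'Total' row and rank states by active cases,
# adding a recovery-rate column for context.
(df.filter(F.col('State') != 'Total')
   .withColumn('Recovery_Rate',
               F.round(F.col('Recovered') / F.col('Confirmed') * 100, 2))
   .select('State', 'Confirmed', 'Recovered', 'Active', 'Recovery_Rate')
   .orderBy(F.col('Active').desc())
   .show(10))
```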