Commit

files added
lijoatimaginea committed Nov 10, 2020
0 parents commit b3efdac
Showing 104 changed files with 2,937,302 additions and 0 deletions.
20 changes: 20 additions & 0 deletions README.md
@@ -0,0 +1,20 @@
## Different projects/use cases leveraging Apache Spark

- [Building a recommendation system with Spark ML and Elasticsearch](https://gitlab.pramati.com/lijoa/spark_playground/-/tree/master/recommendation_system_spark_es)
- [Exploratory data analysis of Covid-19 data using Apache Spark and Python](https://gitlab.pramati.com/lijoa/spark_playground/-/tree/master/covid19_Analysis)
- [Heart disease prediction using Apache Spark ML - Binary Classification](https://gitlab.pramati.com/lijoa/spark_playground/-/tree/master/heart_disease_classification)
- [Mutual fund analysis and ranking](https://gitlab.pramati.com/lijoa/spark_playground/-/tree/master/mutual_fund_data_analysis)
- [News Article Classification](https://gitlab.pramati.com/lijoa/spark_playground/-/tree/master/news_aggregator)
- [Titanic Survivor prediction](https://gitlab.pramati.com/lijoa/spark_playground/-/tree/master/titanic_analysis)

## My blogs about Spark and ML

- [Medium](https://medium.com/@lijoabraham1234)
- [Imaginea](https://blog.imaginea.com/author/lijo-abraham/)


## To run the Jupyter notebooks

```
cd <notebook_directory>   # e.g. covid19_Analysis
jupyter notebook
```
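
If PySpark and the plotting libraries are not already installed, a minimal setup sketch (assuming pip and a Python 3 environment; these are the standard PyPI package names) is:

```
pip install pyspark jupyter seaborn matplotlib
```
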
@@ -0,0 +1,235 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Covid19 India Data Analysis using Spark"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-05T10:36:14.213656Z",
"start_time": "2020-08-05T10:36:14.179810Z"
}
},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <div>\n",
" <p><b>SparkSession - in-memory</b></p>\n",
" \n",
" <div>\n",
" <p><b>SparkContext</b></p>\n",
"\n",
" <p><a href=\"http://192.168.0.49:4042\">Spark UI</a></p>\n",
"\n",
" <dl>\n",
" <dt>Version</dt>\n",
" <dd><code>v3.0.0</code></dd>\n",
" <dt>Master</dt>\n",
" <dd><code>local[*]</code></dd>\n",
" <dt>AppName</dt>\n",
" <dd><code>covid19_India_Data_Analysis</code></dd>\n",
" </dl>\n",
" </div>\n",
" \n",
" </div>\n",
" "
],
"text/plain": [
"<pyspark.sql.session.SparkSession at 0x7fccc95b2048>"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pyspark\n",
"from pyspark.sql import *\n",
"spark = SparkSession.builder.appName(\"covid19_India_Data_Analysis\").getOrCreate()\n",
"spark"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-05T12:08:36.833663Z",
"start_time": "2020-08-05T12:08:36.513877Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------+---------+---------+------+------+\n",
"| State|Confirmed|Recovered|Deaths|Active|\n",
"+-----------+---------+---------+------+------+\n",
"| Total| 1910681| 1282917| 39856|587459|\n",
"|Maharashtra| 457956| 299356| 16142|142151|\n",
"+-----------+---------+---------+------+------+\n",
"only showing top 2 rows\n",
"\n"
]
}
],
"source": [
"df = spark.read.csv('./data/state_wise.csv', escape='\"', header=True, \n",
" inferSchema=True, multiLine=True)\n",
"df = df.withColumnRenamed(\"State_Notes\\r\", \"State_Notes\")\n",
"data = df.select(df.columns[:5])\n",
"data.show(2)"
]
},
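{
"cell_type": "markdown",
"metadata": {},
"source": [
"One quick way to sanity-check the loaded data (an illustrative sketch using the Spark DataFrame API): rank the states by active caseload, excluding the aggregate 'Total' row."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import functions as F\n",
"\n",
"# Top 5 states by active cases; assumes the 'Total' summary row\n",
"# should be excluded from the ranking.\n",
"(data.filter(F.col('State') != 'Total')\n",
"     .orderBy(F.col('Active').desc())\n",
"     .show(5))"
]
},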
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"ExecuteTime": {
"end_time": "2020-08-05T11:36:50.690888Z",
"start_time": "2020-08-05T11:36:50.682836Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"root\n",
" |-- State: string (nullable = true)\n",
" |-- Confirmed: integer (nullable = true)\n",
" |-- Recovered: integer (nullable = true)\n",
" |-- Deaths: integer (nullable = true)\n",
" |-- Active: integer (nullable = true)\n",
" |-- Last_Updated_Time: string (nullable = true)\n",
" |-- Migrated_Other: integer (nullable = true)\n",
" |-- State_code: string (nullable = true)\n",
" |-- Delta_Confirmed: integer (nullable = true)\n",
" |-- Delta_Recovered: integer (nullable = true)\n",
" |-- Delta_Deaths: integer (nullable = true)\n",
" |-- State_Notes: string (nullable = true)\n",
"\n"
]
}
],
"source": [
"df.printSchema()"
]
},
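{
"cell_type": "markdown",
"metadata": {},
"source": [
"The schema shows that Last_Updated_Time was inferred as a plain string. A sketch of parsing it into a proper timestamp follows; the format 'dd/MM/yyyy HH:mm:ss' is an assumption about state_wise.csv and should be adjusted if the file differs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql import functions as F\n",
"\n",
"# Assumed timestamp format; adjust if the CSV uses another layout.\n",
"df_ts = df.withColumn('Last_Updated',\n",
"                      F.to_timestamp('Last_Updated_Time', 'dd/MM/yyyy HH:mm:ss'))\n",
"df_ts.select('State', 'Last_Updated').show(3, truncate=False)"
]
},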
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"start_time": "2020-08-05T12:17:58.339Z"
}
},
"outputs": [],
"source": [
"import seaborn as sns\n",
"sns.set_context('talk')\n",
"\n",
"\n",
"fig, ax = plt.subplots(figsize=(12, 8))\n",
"\n",
"# Our x-axis. We basically just want a list\n",
"# of numbers from zero with a value for each\n",
"# of our jobs.\n",
"x = np.arange(len(df.job.unique()))\n",
"\n",
"# Define bar width. We need this to offset the second bar.\n",
"bar_width = 0.4\n",
"\n",
"b1 = ax.bar(x, df['Confirmed'], 'count'],\n",
" width=bar_width, label='Men')\n",
"# Same thing, but offset the x.\n",
"b2 = ax.bar(x + bar_width, df.loc[df['sex'] == 'women', 'count'],\n",
" width=bar_width, label='Women')\n",
"\n",
"# Fix the x-axes.\n",
"ax.set_xticks(x + bar_width / 2)\n",
"ax.set_xticklabels(df.job.unique())\n",
"\n",
"# Add legend.\n",
"ax.legend()\n",
"\n",
"# Axis styling.\n",
"ax.spines['top'].set_visible(False)\n",
"ax.spines['right'].set_visible(False)\n",
"ax.spines['left'].set_visible(False)\n",
"ax.spines['bottom'].set_color('#DDDDDD')\n",
"ax.tick_params(bottom=False, left=False)\n",
"ax.set_axisbelow(True)\n",
"ax.yaxis.grid(True, color='#EEEEEE')\n",
"ax.xaxis.grid(False)\n",
"\n",
"# Add axis and chart labels.\n",
"ax.set_xlabel('Job', labelpad=15)\n",
"ax.set_ylabel('# Employed', labelpad=15)\n",
"ax.set_title('Employed Workers by Gender for Select Jobs', pad=15)\n",
"\n",
"fig.tight_layout()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}
5 changes: 5 additions & 0 deletions covid19_Analysis/README.md
@@ -0,0 +1,5 @@
# Exploratory data analysis of Covid-19 data using Apache Spark and Python

## The full analysis and explanation can be found in the Medium and Imaginea blog posts:
- [Medium link](https://medium.com/@lijoabraham1234/exploratory-data-analysis-of-covid-19-data-using-apache-spark-and-python-c62300d67595)
- [Imaginea link](https://blog.imaginea.com/exploratory-data-analysis-of-covid-19-data-using-apache-spark-and-python/)