kaggle house price submission

prathamesh-borse · Oct 2, 2021 · f9c7713 · f9c7713
1 parent d236b64
commit f9c7713
Show file tree

Hide file tree

Showing 8 changed files with 6,147 additions and 0 deletions.
diff --git a/house-prices-ML/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/house-prices-ML/.ipynb_checkpoints/Untitled-checkpoint.ipynb
@@ -0,0 +1,6 @@
+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/house-prices-ML/.ipynb_checkpoints/Untitled1-checkpoint.ipynb b/house-prices-ML/.ipynb_checkpoints/Untitled1-checkpoint.ipynb
@@ -0,0 +1,6 @@
+{
+ "cells": [],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/house-prices-ML/Untitled.ipynb b/house-prices-ML/Untitled.ipynb
@@ -0,0 +1,164 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "8b324b6b",
+   "metadata": {},
+   "source": [
+    "# Import the required tools and libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "635df08c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.preprocessing import OneHotEncoder\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.ensemble import RandomForestRegressor\n",
+    "from sklearn.metrics import mean_absolute_error"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "188c4163",
+   "metadata": {},
+   "source": [
+    "# Load the data and remove irrelevant data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "10aa0c7f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the data\n",
+    "X = pd.read_csv('./input/train.csv', index_col='Id') \n",
+    "X_test = pd.read_csv('./input/test.csv', index_col='Id')\n",
+    "\n",
+    "# Remove rows with missing target, separate target from predictors\n",
+    "X.dropna(axis=0, subset=['SalePrice'], inplace=True)\n",
+    "y = X.SalePrice\n",
+    "X.drop(['SalePrice'], axis=1, inplace=True)\n",
+    "\n",
+    "# To keep things simple, we'll drop columns with missing values\n",
+    "cols_with_missing = [col for col in X.columns if X[col].isnull().any()] \n",
+    "X.drop(cols_with_missing, axis=1, inplace=True)\n",
+    "X_test.drop(cols_with_missing, axis=1, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c6e1b1aa",
+   "metadata": {},
+   "source": [
+    "# Preprocess data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "79d8d4a9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Categorical columns that will be one-hot encoded: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']\n",
+      "\n",
+      "Categorical columns that will be dropped from the dataset: ['Neighborhood', 'Exterior2nd', 'Exterior1st']\n"
+     ]
+    }
+   ],
+   "source": [
+    "object_cols = [col for col in X.columns if X[col].dtype == \"object\"]\n",
+    "# Columns that will be one-hot encoded\n",
+    "low_cardinality_cols = [col for col in object_cols if X[col].nunique() < 10]\n",
+    "# Columns that will be dropped from the dataset\n",
+    "high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))\n",
+    "\n",
+    "print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)\n",
+    "print('\\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)\n",
+    "\n",
+    "nullCol=[col for col in X_test.columns if X_test[col].isnull().any()]\n",
+    "for col in nullCol: \n",
+    "    if X_test[col].dtype=='object':\n",
+    "        most_frequent=max(set(list(X[col])),key=list(X[col]).count) \n",
+    "        X_test[col]=X_test[col].fillna(most_frequent)\n",
+    "    else:\n",
+    "        X_test[col]=X_test[col].fillna(X[col].median())\n",
+    "\n",
+    "one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)\n",
+    "OH_cols_train = pd.DataFrame(one_hot_encoder.fit_transform(X[low_cardinality_cols]))\n",
+    "OH_cols_test = pd.DataFrame(one_hot_encoder.transform(X_test[low_cardinality_cols]))\n",
+    "\n",
+    "OH_cols_train.index = X.index\n",
+    "OH_cols_test.index = X_test.index\n",
+    "\n",
+    "num_X_train = X.drop(object_cols, axis=1)\n",
+    "num_X_test = X_test.drop(object_cols, axis=1)\n",
+    "\n",
+    "OH_X_train = pd.concat([OH_cols_train,num_X_train], axis=1)\n",
+    "OH_X_test = pd.concat([OH_cols_test,num_X_test], axis=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5b7ece15",
+   "metadata": {},
+   "source": [
+    "# Fit data on model and predict and save"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "c16eefe2",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "SUCCESS!\n"
+     ]
+    }
+   ],
+   "source": [
+    "my_model = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)\n",
+    "my_model.fit(OH_X_train , y)\n",
+    "preds_test=my_model.predict(OH_X_test)\n",
+    "output = pd.DataFrame({'Id': X_test.index,'SalePrice': preds_test})\n",
+    "output.to_csv('submission.csv', index=False)\n",
+    "print(\"SUCCESS!\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/house-prices-ML/Untitled1.ipynb b/house-prices-ML/Untitled1.ipynb
@@ -0,0 +1,130 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "6dca7e38",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.preprocessing import OneHotEncoder\n",
+    "from xgboost import XGBRegressor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "8ed5f719",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the data\n",
+    "X = pd.read_csv('./input/train.csv', index_col='Id') \n",
+    "X_test = pd.read_csv('./input/test.csv', index_col='Id')\n",
+    "\n",
+    "# Remove rows with missing target, separate target from predictors\n",
+    "X.dropna(axis=0, subset=['SalePrice'], inplace=True)\n",
+    "y = X.SalePrice\n",
+    "X.drop(['SalePrice'], axis=1, inplace=True)\n",
+    "\n",
+    "# To keep things simple, we'll drop columns with missing values\n",
+    "cols_with_missing = [col for col in X.columns if X[col].isnull().any()] \n",
+    "X.drop(cols_with_missing, axis=1, inplace=True)\n",
+    "X_test.drop(cols_with_missing, axis=1, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "b8c65529",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Categorical columns that will be one-hot encoded: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']\n",
+      "\n",
+      "Categorical columns that will be dropped from the dataset: ['Exterior2nd', 'Neighborhood', 'Exterior1st']\n"
+     ]
+    }
+   ],
+   "source": [
+    "object_cols = [col for col in X.columns if X[col].dtype == \"object\"]\n",
+    "# Columns that will be one-hot encoded\n",
+    "low_cardinality_cols = [col for col in object_cols if X[col].nunique() < 10]\n",
+    "# Columns that will be dropped from the dataset\n",
+    "high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))\n",
+    "\n",
+    "print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)\n",
+    "print('\\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)\n",
+    "\n",
+    "nullCol=[col for col in X_test.columns if X_test[col].isnull().any()]\n",
+    "for col in nullCol: \n",
+    "    if X_test[col].dtype=='object':\n",
+    "        most_frequent=max(set(list(X[col])),key=list(X[col]).count) \n",
+    "        X_test[col]=X_test[col].fillna(most_frequent)\n",
+    "    else:\n",
+    "        X_test[col]=X_test[col].fillna(X[col].median())\n",
+    "\n",
+    "one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)\n",
+    "OH_cols_train = pd.DataFrame(one_hot_encoder.fit_transform(X[low_cardinality_cols]))\n",
+    "OH_cols_test = pd.DataFrame(one_hot_encoder.transform(X_test[low_cardinality_cols]))\n",
+    "\n",
+    "OH_cols_train.index = X.index\n",
+    "OH_cols_test.index = X_test.index\n",
+    "\n",
+    "num_X_train = X.drop(object_cols, axis=1)\n",
+    "num_X_test = X_test.drop(object_cols, axis=1)\n",
+    "\n",
+    "OH_X_train = pd.concat([OH_cols_train,num_X_train], axis=1)\n",
+    "OH_X_test = pd.concat([OH_cols_test,num_X_test], axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "da44fb2a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "SUCCESS!\n"
+     ]
+    }
+   ],
+   "source": [
+    "model=XGBRegressor(n_estimators=200,learning_rate=0.005)\n",
+    "model.fit(OH_X_train,y)\n",
+    "preds_test=model.predict(OH_X_test)\n",
+    "output = pd.DataFrame({'Id': X_test.index,'SalePrice': preds_test})\n",
+    "output.to_csv('submission1.csv', index=False)\n",
+    "print(\"SUCCESS!\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}