Skip to content

Commit

Permalink
kaggle house price submission
Browse files Browse the repository at this point in the history
  • Loading branch information
Ritwick-Roy committed Oct 2, 2021
1 parent d236b64 commit f9c7713
Show file tree
Hide file tree
Showing 8 changed files with 6,147 additions and 0 deletions.
6 changes: 6 additions & 0 deletions house-prices-ML/.ipynb_checkpoints/Untitled-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}
6 changes: 6 additions & 0 deletions house-prices-ML/.ipynb_checkpoints/Untitled1-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}
164 changes: 164 additions & 0 deletions house-prices-ML/Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "8b324b6b",
"metadata": {},
"source": [
"# Import the required tools and libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "635df08c",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.metrics import mean_absolute_error"
]
},
{
"cell_type": "markdown",
"id": "188c4163",
"metadata": {},
"source": [
"# Load the data and remove irrelevant data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "10aa0c7f",
"metadata": {},
"outputs": [],
"source": [
"# Read the data\n",
"X = pd.read_csv('./input/train.csv', index_col='Id') \n",
"X_test = pd.read_csv('./input/test.csv', index_col='Id')\n",
"\n",
"# Remove rows with missing target, separate target from predictors\n",
"X.dropna(axis=0, subset=['SalePrice'], inplace=True)\n",
"y = X.SalePrice\n",
"X.drop(['SalePrice'], axis=1, inplace=True)\n",
"\n",
"# To keep things simple, we'll drop columns with missing values\n",
"cols_with_missing = [col for col in X.columns if X[col].isnull().any()] \n",
"X.drop(cols_with_missing, axis=1, inplace=True)\n",
"X_test.drop(cols_with_missing, axis=1, inplace=True)"
]
},
{
"cell_type": "markdown",
"id": "c6e1b1aa",
"metadata": {},
"source": [
"# Preprocess data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "79d8d4a9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Categorical columns that will be one-hot encoded: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']\n",
"\n",
"Categorical columns that will be dropped from the dataset: ['Neighborhood', 'Exterior2nd', 'Exterior1st']\n"
]
}
],
"source": [
"object_cols = [col for col in X.columns if X[col].dtype == \"object\"]\n",
"# Columns that will be one-hot encoded\n",
"low_cardinality_cols = [col for col in object_cols if X[col].nunique() < 10]\n",
"# Columns that will be dropped from the dataset\n",
"high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))\n",
"\n",
"print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)\n",
"print('\\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)\n",
"\n",
"nullCol=[col for col in X_test.columns if X_test[col].isnull().any()]\n",
"for col in nullCol: \n",
" if X_test[col].dtype=='object':\n",
" most_frequent=max(set(list(X[col])),key=list(X[col]).count) \n",
" X_test[col]=X_test[col].fillna(most_frequent)\n",
" else:\n",
" X_test[col]=X_test[col].fillna(X[col].median())\n",
"\n",
"one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)\n",
"OH_cols_train = pd.DataFrame(one_hot_encoder.fit_transform(X[low_cardinality_cols]))\n",
"OH_cols_test = pd.DataFrame(one_hot_encoder.transform(X_test[low_cardinality_cols]))\n",
"\n",
"OH_cols_train.index = X.index\n",
"OH_cols_test.index = X_test.index\n",
"\n",
"num_X_train = X.drop(object_cols, axis=1)\n",
"num_X_test = X_test.drop(object_cols, axis=1)\n",
"\n",
"OH_X_train = pd.concat([OH_cols_train,num_X_train], axis=1)\n",
"OH_X_test = pd.concat([OH_cols_test,num_X_test], axis=1)"
]
},
{
"cell_type": "markdown",
"id": "5b7ece15",
"metadata": {},
"source": [
"# Fit data on model and predict and save"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c16eefe2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SUCCESS!\n"
]
}
],
"source": [
"my_model = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)\n",
"my_model.fit(OH_X_train , y)\n",
"preds_test=my_model.predict(OH_X_test)\n",
"output = pd.DataFrame({'Id': X_test.index,'SalePrice': preds_test})\n",
"output.to_csv('submission.csv', index=False)\n",
"print(\"SUCCESS!\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
130 changes: 130 additions & 0 deletions house-prices-ML/Untitled1.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "6dca7e38",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"from xgboost import XGBRegressor"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "8ed5f719",
"metadata": {},
"outputs": [],
"source": [
"# Read the data\n",
"X = pd.read_csv('./input/train.csv', index_col='Id') \n",
"X_test = pd.read_csv('./input/test.csv', index_col='Id')\n",
"\n",
"# Remove rows with missing target, separate target from predictors\n",
"X.dropna(axis=0, subset=['SalePrice'], inplace=True)\n",
"y = X.SalePrice\n",
"X.drop(['SalePrice'], axis=1, inplace=True)\n",
"\n",
"# To keep things simple, we'll drop columns with missing values\n",
"cols_with_missing = [col for col in X.columns if X[col].isnull().any()] \n",
"X.drop(cols_with_missing, axis=1, inplace=True)\n",
"X_test.drop(cols_with_missing, axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b8c65529",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Categorical columns that will be one-hot encoded: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']\n",
"\n",
"Categorical columns that will be dropped from the dataset: ['Exterior2nd', 'Neighborhood', 'Exterior1st']\n"
]
}
],
"source": [
"object_cols = [col for col in X.columns if X[col].dtype == \"object\"]\n",
"# Columns that will be one-hot encoded\n",
"low_cardinality_cols = [col for col in object_cols if X[col].nunique() < 10]\n",
"# Columns that will be dropped from the dataset\n",
"high_cardinality_cols = list(set(object_cols)-set(low_cardinality_cols))\n",
"\n",
"print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)\n",
"print('\\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)\n",
"\n",
"nullCol=[col for col in X_test.columns if X_test[col].isnull().any()]\n",
"for col in nullCol: \n",
" if X_test[col].dtype=='object':\n",
" most_frequent=max(set(list(X[col])),key=list(X[col]).count) \n",
" X_test[col]=X_test[col].fillna(most_frequent)\n",
" else:\n",
" X_test[col]=X_test[col].fillna(X[col].median())\n",
"\n",
"one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)\n",
"OH_cols_train = pd.DataFrame(one_hot_encoder.fit_transform(X[low_cardinality_cols]))\n",
"OH_cols_test = pd.DataFrame(one_hot_encoder.transform(X_test[low_cardinality_cols]))\n",
"\n",
"OH_cols_train.index = X.index\n",
"OH_cols_test.index = X_test.index\n",
"\n",
"num_X_train = X.drop(object_cols, axis=1)\n",
"num_X_test = X_test.drop(object_cols, axis=1)\n",
"\n",
"OH_X_train = pd.concat([OH_cols_train,num_X_train], axis=1)\n",
"OH_X_test = pd.concat([OH_cols_test,num_X_test], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "da44fb2a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SUCCESS!\n"
]
}
],
"source": [
"model=XGBRegressor(n_estimators=200,learning_rate=0.005)\n",
"model.fit(OH_X_train,y)\n",
"preds_test=model.predict(OH_X_test)\n",
"output = pd.DataFrame({'Id': X_test.index,'SalePrice': preds_test})\n",
"output.to_csv('submission1.csv', index=False)\n",
"print(\"SUCCESS!\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading

0 comments on commit f9c7713

Please sign in to comment.