-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
1 addition
and
53 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"cells":[{"cell_type":"markdown","metadata":{"_cell_guid":"c53980f3-43e6-4309-b5b3-8c387c94c2ef","_uuid":"ac363b82-57c3-4cb3-87ed-d2c3d463e1a5","trusted":true},"source":["# KAIM Week 8 and 9 Challenges"]},{"cell_type":"markdown","metadata":{"_cell_guid":"9597c964-28b3-47e3-9f51-0691092476ce","_uuid":"977dc833-b4c5-441a-9ca2-d7e8e86bedb1","trusted":true},"source":["## **Task 2: MOdel Building**"]},{"cell_type":"markdown","metadata":{"_cell_guid":"87a00a83-d488-4a01-a3c8-2d2d9f956d41","_uuid":"7ffdbba4-f7f1-4b89-92ea-16cc52919bc6","trusted":true},"source":["## Import Necessary Libraries"]},{"cell_type":"code","execution_count":1,"metadata":{"_cell_guid":"98500711-7c8e-4a73-a926-ad743b2ca99a","_uuid":"05984005-6423-4192-8e7f-856e08148d1f","collapsed":false,"execution":{"iopub.execute_input":"2024-10-18T11:28:10.886046Z","iopub.status.busy":"2024-10-18T11:28:10.885064Z","iopub.status.idle":"2024-10-18T11:28:12.528653Z","shell.execute_reply":"2024-10-18T11:28:12.527539Z","shell.execute_reply.started":"2024-10-18T11:28:10.886001Z"},"jupyter":{"outputs_hidden":false},"trusted":true},"outputs":[],"source":["# Import necessary libraries\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import seaborn as sns\n","from sklearn.preprocessing import StandardScaler, LabelEncoder\n","from sklearn.impute import SimpleImputer\n","from sklearn.model_selection import train_test_split\n","from sklearn.linear_model import LogisticRegression\n","from sklearn.ensemble import RandomForestClassifier\n","from xgboost import XGBClassifier\n","from sklearn.metrics import classification_report\n","from sklearn.tree import DecisionTreeClassifier\n","from sklearn.neural_network import MLPClassifier\n","from tensorflow.keras.models import Sequential\n","from tensorflow.keras.layers import Dense, Conv2D, Flatten, SimpleRNN, LSTM\n","import mlflow\n","import mlflow.sklearn\n","import mlflow.xgboost\n","import warnings\n","from imblearn.combine import SMOTETomek\n","\n","warnings.filterwarnings('ignore')\n","\n","# Set plot style for better visuals\n","sns.set(style=\"whitegrid\")"]},{"cell_type":"markdown","metadata":{"_cell_guid":"59a324ba-900b-40da-8f79-ee8f7957ef5b","_uuid":"8c78d907-d277-48a3-951f-ee7f01b6a7a7","trusted":true},"source":["## Load Datasets"]},{"cell_type":"code","execution_count":2,"metadata":{"_cell_guid":"98601976-817f-413e-ad3a-8c7eedb11606","_uuid":"b72ec9f6-c664-4339-8d29-555121dbd61f","collapsed":false,"execution":{"iopub.execute_input":"2024-10-18T11:28:15.198096Z","iopub.status.busy":"2024-10-18T11:28:15.197361Z","iopub.status.idle":"2024-10-18T11:28:20.098319Z","shell.execute_reply":"2024-10-18T11:28:20.097043Z","shell.execute_reply.started":"2024-10-18T11:28:15.198047Z"},"jupyter":{"outputs_hidden":false},"trusted":true},"outputs":[],"source":["# Load the datasets\n","fraud_data = pd.read_csv('../data/cleaned_data_1.csv')\n","credit_data = pd.read_csv('../data/cleaned_data_2.csv')"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[{"data":{"text/plain":["((138846, 15), (283726, 31))"]},"execution_count":3,"metadata":{},"output_type":"execute_result"}],"source":["fraud_data.shape, credit_data.shape"]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[{"data":{"text/plain":["(Index(['user_id', 'purchase_value', 'source', 'browser', 'sex', 'age',\n"," 'ip_address', 'Class', 'country', 'lower_bound_ip_addres',\n"," 'upper_bound_ip_adress', 'signup_purchase_diff', 'transaction_count',\n"," 'hour_of_day', 'day_of_week'],\n"," dtype='object'),\n"," Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',\n"," 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',\n"," 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',\n"," 'Class'],\n"," dtype='object'))"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["fraud_data.columns, credit_data.columns"]},{"cell_type":"markdown","metadata":{"_cell_guid":"a1c436e7-42e4-474c-8a8f-b00592a274ff","_uuid":"79aa1d6d-a15f-4224-815b-4344da833bbf","trusted":true},"source":["## Model Building"]},{"cell_type":"code","execution_count":5,"metadata":{"_cell_guid":"a49734b2-8a44-42d2-8b30-bd5a916c17eb","_uuid":"ab90e452-bf1c-4287-a72e-c1599293b5b7","collapsed":false,"execution":{"iopub.execute_input":"2024-10-18T12:12:29.677630Z","iopub.status.busy":"2024-10-18T12:12:29.677086Z","iopub.status.idle":"2024-10-18T12:13:18.305808Z","shell.execute_reply":"2024-10-18T12:13:18.304575Z","shell.execute_reply.started":"2024-10-18T12:12:29.677583Z"},"jupyter":{"outputs_hidden":false},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["(array([0, 1], dtype=int64), array([125849, 12997], dtype=int64))\n","(array([0, 1], dtype=int64), array([283253, 473], dtype=int64))\n"]}],"source":["# Prepare data for the model (e-commerce)\n","X1 = fraud_data.drop(columns=['Class'])\n","X2 = credit_data.drop(columns=['Class'])\n","\n","y1 = fraud_data['Class']\n","y2 = credit_data['Class']\n","\n","print(np.unique(y1, return_counts=True))\n","print(np.unique(y2, return_counts=True))"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[],"source":["X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)\n","\n","X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)\n"]},{"cell_type":"markdown","metadata":{},"source":["## Class Balancing with SMOTE"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["smt = SMOTETomek(random_state=42)\n","X_train1, y_train1 = smt.fit_resample(X_train1, y_train1)\n","X_train2, y_train2 = smt.fit_resample(X_train2, y_train2)\n","\n","print(np.unique(X_train1, return_counts=True))\n","print(np.unique(X_train2, return_counts=True))\n","print(np.unique(y_train1, return_counts=True))\n","print(np.unique(y_train2, return_counts=True))"]},{"cell_type":"markdown","metadata":{},"source":["### Logistic Regression"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" precision recall f1-score support\n","\n"," 0 0.91 1.00 0.95 25193\n"," 1 0.00 0.00 0.00 2577\n","\n"," accuracy 0.91 27770\n"," macro avg 0.45 0.50 0.48 27770\n","weighted avg 0.82 0.91 0.86 27770\n","\n"]}],"source":["log_reg = LogisticRegression(C=1, solver='liblinear')\n","log_reg.fit(X_train, y_train)\n","y_pred_log_reg = log_reg.predict(X_test)\n","print(classification_report(y_test, y_pred_log_reg))"]},{"cell_type":"markdown","metadata":{},"source":["### Random Forest"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" precision recall f1-score support\n","\n"," 0 0.96 1.00 0.98 25193\n"," 1 1.00 0.55 0.71 2577\n","\n"," accuracy 0.96 27770\n"," macro avg 0.98 0.77 0.84 27770\n","weighted avg 0.96 0.96 0.95 27770\n","\n"]}],"source":["rf_clf = RandomForestClassifier(n_estimators=30, max_depth=3)\n","rf_clf.fit(X_train, y_train)\n","y_pred_rf = rf_clf.predict(X_test)\n","print(classification_report(y_test, y_pred_rf))"]},{"cell_type":"markdown","metadata":{},"source":["### XGBoost "]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" precision recall f1-score support\n","\n"," 0 0.96 1.00 0.98 25193\n"," 1 0.94 0.55 0.69 2577\n","\n"," accuracy 0.96 27770\n"," macro avg 0.95 0.77 0.84 27770\n","weighted avg 0.95 0.96 0.95 27770\n","\n"]}],"source":["xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')\n","xgb_clf.fit(X_train, y_train)\n","y_pred_xgb = xgb_clf.predict(X_test)\n","print(classification_report(y_test, y_pred_xgb))"]},{"cell_type":"markdown","metadata":{},"source":["## Experiments"]},{"cell_type":"code","execution_count":50,"metadata":{},"outputs":[],"source":["datasets = {\n"," \"fraud_data\": (X_train1, y_train1, X_test1, y_test1),\n"," \"credit_data\": (X_train2, y_train2, X_test2, y_test2),\n","}"]},{"cell_type":"code","execution_count":54,"metadata":{},"outputs":[],"source":["models = [\n"," (\n"," \"Logistic Regression\", \n"," LogisticRegression(C=1, solver='liblinear')\n"," ),\n"," (\n"," \"Random Forest\", \n"," RandomForestClassifier(n_estimators=30, max_depth=3)\n"," ),\n"," (\n"," \"XGBClassifier\",\n"," XGBClassifier(use_label_encoder=False, eval_metric='logloss')\n"," )\n","]"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["2024/10/20 13:43:16 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 13:43:16 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression at: http://localhost:5000/#/experiments/935012191474515353/runs/66416527b5ad44a0be0886ed9e43b658.\n","2024/10/20 13:43:16 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/935012191474515353.\n","2024/10/20 13:43:20 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 13:43:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest at: http://localhost:5000/#/experiments/935012191474515353/runs/3c3923af256d4ca4947318c3e70a6fbf.\n","2024/10/20 13:43:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/935012191474515353.\n","2024/10/20 13:43:25 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 13:43:25 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBClassifier at: http://localhost:5000/#/experiments/935012191474515353/runs/019f76c5419a4e24a15a654ad2358b1c.\n","2024/10/20 13:43:25 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/935012191474515353.\n","2024/10/20 13:43:28 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 13:43:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBClassifier With SMOTE at: http://localhost:5000/#/experiments/935012191474515353/runs/b89994c7190a441495941622db7bf9ac.\n","2024/10/20 13:43:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/935012191474515353.\n"]}],"source":["# Initialize MLflow\n","mlflow.set_experiment(\"Fraud Detection Models - Single Dataset\")\n","mlflow.set_tracking_uri(\"http://localhost:5000\")\n","\n","reports = []\n","\n","for model_name, model, train_set, test_set in models:\n"," X_train = train_set[0]\n"," y_train = train_set[1]\n"," X_test = test_set[0]\n"," y_test = test_set[1]\n"," \n"," model.fit(X_train, y_train)\n"," y_pred = model.predict(X_test)\n"," report = classification_report(y_test, y_pred, output_dict=True)\n"," reports.append(report)\n","\n","for i, element in enumerate(models):\n"," model_name = element[0]\n"," model = element[1]\n"," report = reports[i]\n"," \n"," with mlflow.start_run(run_name=model_name): \n"," mlflow.log_param(\"model\", model_name)\n"," mlflow.log_metric('accuracy', report['accuracy'])\n"," mlflow.log_metric('recall_class_1', report['1']['recall'])\n"," mlflow.log_metric('recall_class_0', report['0']['recall'])\n"," mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score']) \n"," \n"," if \"XGB\" in model_name:\n"," mlflow.xgboost.log_model(model, \"model\")\n"," else:\n"," mlflow.sklearn.log_model(model, \"model\") "]},{"cell_type":"code","execution_count":55,"metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["2024/10/20 20:16:21 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 20:16:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression_fraud_data at: http://localhost:5000/#/experiments/478268722598582565/runs/8273f777699647a39535d873391b9d13.\n","2024/10/20 20:16:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/478268722598582565.\n","2024/10/20 20:16:24 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 20:16:24 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest_fraud_data at: http://localhost:5000/#/experiments/478268722598582565/runs/65913aaab7ce49948b90f6752b771312.\n","2024/10/20 20:16:24 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/478268722598582565.\n","2024/10/20 20:16:27 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 20:16:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBClassifier_fraud_data at: http://localhost:5000/#/experiments/478268722598582565/runs/e262b2c32c8442b2a720a086f36a2f63.\n","2024/10/20 20:16:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/478268722598582565.\n","2024/10/20 20:16:52 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 20:16:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression_credit_data at: http://localhost:5000/#/experiments/478268722598582565/runs/37abcc1c28b54206bffc2f7695ca9d49.\n","2024/10/20 20:16:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/478268722598582565.\n","2024/10/20 20:16:56 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 20:16:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest_credit_data at: http://localhost:5000/#/experiments/478268722598582565/runs/afef239a5b3845b082ab9368d9621edd.\n","2024/10/20 20:16:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/478268722598582565.\n","2024/10/20 20:16:59 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 20:16:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBClassifier_credit_data at: http://localhost:5000/#/experiments/478268722598582565/runs/bcfb79f52acc4f82bf61298763c208df.\n","2024/10/20 20:16:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/478268722598582565.\n"]}],"source":["# Initialize MLflow\n","mlflow.set_experiment(\"Fraud Detection Models - 2 Datasets\")\n","mlflow.set_tracking_uri(\"http://localhost:5000\")\n","\n","# Iterate through datasets\n","for dataset_name, (X_train, y_train, X_test, y_test) in datasets.items():\n"," reports = []\n","\n"," # Train each model on the current dataset\n"," for model_name, model in models:\n"," model.fit(X_train, y_train)\n"," y_pred = model.predict(X_test)\n"," report = classification_report(y_test, y_pred, output_dict=True)\n"," reports.append(report)\n","\n"," # Log each model's performance metrics to MLflow\n"," for i, (model_name, model) in enumerate(models):\n"," report = reports[i]\n","\n"," with mlflow.start_run(run_name=f\"{model_name}_{dataset_name}\"):\n"," mlflow.log_param(\"model\", model_name)\n"," mlflow.log_param(\"dataset\", dataset_name)\n"," mlflow.log_metric('accuracy', report['accuracy'])\n"," mlflow.log_metric('recall_class_1', report['1']['recall'])\n"," mlflow.log_metric('recall_class_0', report['0']['recall'])\n"," mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score'])\n","\n"," # Log the model using the appropriate MLflow method\n"," if \"XGBoost\" in model_name:\n"," mlflow.xgboost.log_model(model, \"model\")\n"," else:\n"," mlflow.sklearn.log_model(model, \"model\")\n"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]}],"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"datasetId":5896976,"sourceId":9653739,"sourceType":"datasetVersion"}],"dockerImageVersionId":30786,"isGpuEnabled":false,"isInternetEnabled":true,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":".week89","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.0"}},"nbformat":4,"nbformat_minor":4} |
This file was deleted.
Oops, something went wrong.