database url removed

tedoaba · Oct 28, 2024 · b389a06 · b389a06
1 parent e9bde72
commit b389a06
Show file tree

Hide file tree

Showing 3 changed files with 1 addition and 53 deletions.
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -17,18 +17,6 @@ services:
     volumes:
       - .:/app
 
-  db:
-    image: postgres:17
-    container_name: postgres_db
-    environment:
-      POSTGRES_DB: ${POSTGRES_DB}
-      POSTGRES_USER: ${POSTGRES_USER}
-      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
-    ports:
-      - "5432:5432"
-    volumes:
-      - postgres_data:/var/lib/postgresql/data
-
   css:
     image: node:14
     container_name: css_processor

diff --git a/notebooks/kaim2-week-8-9-task-2-balanced_class.ipynb b/notebooks/kaim2-week-8-9-task-2-balanced_class.ipynb
@@ -0,0 +1 @@
+{"cells":[{"cell_type":"markdown","metadata":{"_cell_guid":"c53980f3-43e6-4309-b5b3-8c387c94c2ef","_uuid":"ac363b82-57c3-4cb3-87ed-d2c3d463e1a5","trusted":true},"source":["# KAIM Week 8 and 9 Challenges"]},{"cell_type":"markdown","metadata":{"_cell_guid":"9597c964-28b3-47e3-9f51-0691092476ce","_uuid":"977dc833-b4c5-441a-9ca2-d7e8e86bedb1","trusted":true},"source":["## **Task 2: Model Building**"]},{"cell_type":"markdown","metadata":{"_cell_guid":"87a00a83-d488-4a01-a3c8-2d2d9f956d41","_uuid":"7ffdbba4-f7f1-4b89-92ea-16cc52919bc6","trusted":true},"source":["## Import Necessary Libraries"]},{"cell_type":"code","execution_count":1,"metadata":{"_cell_guid":"98500711-7c8e-4a73-a926-ad743b2ca99a","_uuid":"05984005-6423-4192-8e7f-856e08148d1f","collapsed":false,"execution":{"iopub.execute_input":"2024-10-18T11:28:10.886046Z","iopub.status.busy":"2024-10-18T11:28:10.885064Z","iopub.status.idle":"2024-10-18T11:28:12.528653Z","shell.execute_reply":"2024-10-18T11:28:12.527539Z","shell.execute_reply.started":"2024-10-18T11:28:10.886001Z"},"jupyter":{"outputs_hidden":false},"trusted":true},"outputs":[],"source":["# Import necessary libraries\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import seaborn as sns\n","from sklearn.preprocessing import StandardScaler, LabelEncoder\n","from sklearn.impute import SimpleImputer\n","from sklearn.model_selection import train_test_split\n","from sklearn.linear_model import LogisticRegression\n","from sklearn.ensemble import RandomForestClassifier\n","from xgboost import XGBClassifier\n","from sklearn.metrics import classification_report\n","from sklearn.tree import DecisionTreeClassifier\n","from sklearn.neural_network import MLPClassifier\n","from tensorflow.keras.models import Sequential\n","from tensorflow.keras.layers import Dense, Conv2D, Flatten, SimpleRNN, LSTM\n","import mlflow\n","import mlflow.sklearn\n","import mlflow.xgboost\n","import warnings\n","from imblearn.combine import 
SMOTETomek\n","\n","warnings.filterwarnings('ignore')\n","\n","# Set plot style for better visuals\n","sns.set(style=\"whitegrid\")"]},{"cell_type":"markdown","metadata":{"_cell_guid":"59a324ba-900b-40da-8f79-ee8f7957ef5b","_uuid":"8c78d907-d277-48a3-951f-ee7f01b6a7a7","trusted":true},"source":["## Load Datasets"]},{"cell_type":"code","execution_count":2,"metadata":{"_cell_guid":"98601976-817f-413e-ad3a-8c7eedb11606","_uuid":"b72ec9f6-c664-4339-8d29-555121dbd61f","collapsed":false,"execution":{"iopub.execute_input":"2024-10-18T11:28:15.198096Z","iopub.status.busy":"2024-10-18T11:28:15.197361Z","iopub.status.idle":"2024-10-18T11:28:20.098319Z","shell.execute_reply":"2024-10-18T11:28:20.097043Z","shell.execute_reply.started":"2024-10-18T11:28:15.198047Z"},"jupyter":{"outputs_hidden":false},"trusted":true},"outputs":[],"source":["# Load the datasets\n","fraud_data = pd.read_csv('../data/cleaned_data_1.csv')\n","credit_data = pd.read_csv('../data/cleaned_data_2.csv')"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[{"data":{"text/plain":["((138846, 15), (283726, 31))"]},"execution_count":3,"metadata":{},"output_type":"execute_result"}],"source":["fraud_data.shape, credit_data.shape"]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[{"data":{"text/plain":["(Index(['user_id', 'purchase_value', 'source', 'browser', 'sex', 'age',\n","        'ip_address', 'Class', 'country', 'lower_bound_ip_addres',\n","        'upper_bound_ip_adress', 'signup_purchase_diff', 'transaction_count',\n","        'hour_of_day', 'day_of_week'],\n","       dtype='object'),\n"," Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',\n","        'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',\n","        'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',\n","        'Class'],\n","       dtype='object'))"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["fraud_data.columns, 
credit_data.columns"]},{"cell_type":"markdown","metadata":{"_cell_guid":"a1c436e7-42e4-474c-8a8f-b00592a274ff","_uuid":"79aa1d6d-a15f-4224-815b-4344da833bbf","trusted":true},"source":["## Model Building"]},{"cell_type":"code","execution_count":5,"metadata":{"_cell_guid":"a49734b2-8a44-42d2-8b30-bd5a916c17eb","_uuid":"ab90e452-bf1c-4287-a72e-c1599293b5b7","collapsed":false,"execution":{"iopub.execute_input":"2024-10-18T12:12:29.677630Z","iopub.status.busy":"2024-10-18T12:12:29.677086Z","iopub.status.idle":"2024-10-18T12:13:18.305808Z","shell.execute_reply":"2024-10-18T12:13:18.304575Z","shell.execute_reply.started":"2024-10-18T12:12:29.677583Z"},"jupyter":{"outputs_hidden":false},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["(array([0, 1], dtype=int64), array([125849,  12997], dtype=int64))\n","(array([0, 1], dtype=int64), array([283253,    473], dtype=int64))\n"]}],"source":["# Prepare data for the model (e-commerce)\n","X1 = fraud_data.drop(columns=['Class'])\n","X2 = credit_data.drop(columns=['Class'])\n","\n","y1 = fraud_data['Class']\n","y2 = credit_data['Class']\n","\n","print(np.unique(y1, return_counts=True))\n","print(np.unique(y2, return_counts=True))"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[],"source":["X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)\n","\n","X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)\n"]},{"cell_type":"markdown","metadata":{},"source":["## Class Balancing with SMOTE"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["smt = SMOTETomek(random_state=42)\n","X_train1, y_train1 = smt.fit_resample(X_train1, y_train1)\n","X_train2, y_train2 = smt.fit_resample(X_train2, y_train2)\n","\n","print(np.unique(X_train1, return_counts=True))\n","print(np.unique(X_train2, return_counts=True))\n","print(np.unique(y_train1, 
return_counts=True))\n","print(np.unique(y_train2, return_counts=True))"]},{"cell_type":"markdown","metadata":{},"source":["### Logistic Regression"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["              precision    recall  f1-score   support\n","\n","           0       0.91      1.00      0.95     25193\n","           1       0.00      0.00      0.00      2577\n","\n","    accuracy                           0.91     27770\n","   macro avg       0.45      0.50      0.48     27770\n","weighted avg       0.82      0.91      0.86     27770\n","\n"]}],"source":["log_reg = LogisticRegression(C=1, solver='liblinear')\n","log_reg.fit(X_train, y_train)\n","y_pred_log_reg = log_reg.predict(X_test)\n","print(classification_report(y_test, y_pred_log_reg))"]},{"cell_type":"markdown","metadata":{},"source":["### Random Forest"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["              precision    recall  f1-score   support\n","\n","           0       0.96      1.00      0.98     25193\n","           1       1.00      0.55      0.71      2577\n","\n","    accuracy                           0.96     27770\n","   macro avg       0.98      0.77      0.84     27770\n","weighted avg       0.96      0.96      0.95     27770\n","\n"]}],"source":["rf_clf = RandomForestClassifier(n_estimators=30, max_depth=3)\n","rf_clf.fit(X_train, y_train)\n","y_pred_rf = rf_clf.predict(X_test)\n","print(classification_report(y_test, y_pred_rf))"]},{"cell_type":"markdown","metadata":{},"source":["### XGBoost "]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":["              precision    recall  f1-score   support\n","\n","           0       0.96      1.00      0.98     25193\n","           1       0.94      0.55      0.69      2577\n","\n","    accuracy                           0.96     
27770\n","   macro avg       0.95      0.77      0.84     27770\n","weighted avg       0.95      0.96      0.95     27770\n","\n"]}],"source":["xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')\n","xgb_clf.fit(X_train, y_train)\n","y_pred_xgb = xgb_clf.predict(X_test)\n","print(classification_report(y_test, y_pred_xgb))"]},{"cell_type":"markdown","metadata":{},"source":["## Experiments"]},{"cell_type":"code","execution_count":50,"metadata":{},"outputs":[],"source":["datasets = {\n","    \"fraud_data\": (X_train1, y_train1, X_test1, y_test1),\n","    \"credit_data\": (X_train2, y_train2, X_test2, y_test2),\n","}"]},{"cell_type":"code","execution_count":54,"metadata":{},"outputs":[],"source":["models = [\n","    (\n","        \"Logistic Regression\", \n","        LogisticRegression(C=1, solver='liblinear')\n","    ),\n","    (\n","        \"Random Forest\", \n","        RandomForestClassifier(n_estimators=30, max_depth=3)\n","    ),\n","    (\n","        \"XGBClassifier\",\n","        XGBClassifier(use_label_encoder=False, eval_metric='logloss')\n","    )\n","]"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["2024/10/20 13:43:16 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 13:43:16 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression at: http://localhost:5000/#/experiments/935012191474515353/runs/66416527b5ad44a0be0886ed9e43b658.\n","2024/10/20 13:43:16 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/935012191474515353.\n","2024/10/20 13:43:20 WARNING mlflow.models.model: Model logged without a signature and input example. 
Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 13:43:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest at: http://localhost:5000/#/experiments/935012191474515353/runs/3c3923af256d4ca4947318c3e70a6fbf.\n","2024/10/20 13:43:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/935012191474515353.\n","2024/10/20 13:43:25 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 13:43:25 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBClassifier at: http://localhost:5000/#/experiments/935012191474515353/runs/019f76c5419a4e24a15a654ad2358b1c.\n","2024/10/20 13:43:25 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/935012191474515353.\n","2024/10/20 13:43:28 WARNING mlflow.models.model: Model logged without a signature and input example. 
Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 13:43:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBClassifier With SMOTE at: http://localhost:5000/#/experiments/935012191474515353/runs/b89994c7190a441495941622db7bf9ac.\n","2024/10/20 13:43:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/935012191474515353.\n"]}],"source":["# Initialize MLflow\n","mlflow.set_experiment(\"Fraud Detection Models - Single Dataset\")\n","mlflow.set_tracking_uri(\"http://localhost:5000\")\n","\n","reports = []\n","\n","for model_name, model, train_set, test_set in models:\n","    X_train = train_set[0]\n","    y_train = train_set[1]\n","    X_test = test_set[0]\n","    y_test = test_set[1]\n","    \n","    model.fit(X_train, y_train)\n","    y_pred = model.predict(X_test)\n","    report = classification_report(y_test, y_pred, output_dict=True)\n","    reports.append(report)\n","\n","for i, element in enumerate(models):\n","    model_name = element[0]\n","    model = element[1]\n","    report = reports[i]\n","    \n","    with mlflow.start_run(run_name=model_name):        \n","        mlflow.log_param(\"model\", model_name)\n","        mlflow.log_metric('accuracy', report['accuracy'])\n","        mlflow.log_metric('recall_class_1', report['1']['recall'])\n","        mlflow.log_metric('recall_class_0', report['0']['recall'])\n","        mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score'])        \n","        \n","        if \"XGB\" in model_name:\n","            mlflow.xgboost.log_model(model, \"model\")\n","        else:\n","            mlflow.sklearn.log_model(model, \"model\")  "]},{"cell_type":"code","execution_count":55,"metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["2024/10/20 20:16:21 WARNING mlflow.models.model: Model logged without a signature and input example. 
Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 20:16:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression_fraud_data at: http://localhost:5000/#/experiments/478268722598582565/runs/8273f777699647a39535d873391b9d13.\n","2024/10/20 20:16:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/478268722598582565.\n","2024/10/20 20:16:24 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 20:16:24 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest_fraud_data at: http://localhost:5000/#/experiments/478268722598582565/runs/65913aaab7ce49948b90f6752b771312.\n","2024/10/20 20:16:24 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/478268722598582565.\n","2024/10/20 20:16:27 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 20:16:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBClassifier_fraud_data at: http://localhost:5000/#/experiments/478268722598582565/runs/e262b2c32c8442b2a720a086f36a2f63.\n","2024/10/20 20:16:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/478268722598582565.\n","2024/10/20 20:16:52 WARNING mlflow.models.model: Model logged without a signature and input example. 
Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 20:16:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression_credit_data at: http://localhost:5000/#/experiments/478268722598582565/runs/37abcc1c28b54206bffc2f7695ca9d49.\n","2024/10/20 20:16:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/478268722598582565.\n","2024/10/20 20:16:56 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 20:16:56 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest_credit_data at: http://localhost:5000/#/experiments/478268722598582565/runs/afef239a5b3845b082ab9368d9621edd.\n","2024/10/20 20:16:56 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/478268722598582565.\n","2024/10/20 20:16:59 WARNING mlflow.models.model: Model logged without a signature and input example. 
Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 20:16:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBClassifier_credit_data at: http://localhost:5000/#/experiments/478268722598582565/runs/bcfb79f52acc4f82bf61298763c208df.\n","2024/10/20 20:16:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/478268722598582565.\n"]}],"source":["# Initialize MLflow\n","mlflow.set_experiment(\"Fraud Detection Models - 2 Datasets\")\n","mlflow.set_tracking_uri(\"http://localhost:5000\")\n","\n","# Iterate through datasets\n","for dataset_name, (X_train, y_train, X_test, y_test) in datasets.items():\n","    reports = []\n","\n","    # Train each model on the current dataset\n","    for model_name, model in models:\n","        model.fit(X_train, y_train)\n","        y_pred = model.predict(X_test)\n","        report = classification_report(y_test, y_pred, output_dict=True)\n","        reports.append(report)\n","\n","    # Log each model's performance metrics to MLflow\n","    for i, (model_name, model) in enumerate(models):\n","        report = reports[i]\n","\n","        with mlflow.start_run(run_name=f\"{model_name}_{dataset_name}\"):\n","            mlflow.log_param(\"model\", model_name)\n","            mlflow.log_param(\"dataset\", dataset_name)\n","            mlflow.log_metric('accuracy', report['accuracy'])\n","            mlflow.log_metric('recall_class_1', report['1']['recall'])\n","            mlflow.log_metric('recall_class_0', report['0']['recall'])\n","            mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score'])\n","\n","            # Log the model using the appropriate MLflow method\n","            if \"XGBoost\" in model_name:\n","                mlflow.xgboost.log_model(model, \"model\")\n","            else:\n","                mlflow.sklearn.log_model(model, 
\"model\")\n"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]}],"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"datasetId":5896976,"sourceId":9653739,"sourceType":"datasetVersion"}],"dockerImageVersionId":30786,"isGpuEnabled":false,"isInternetEnabled":true,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":".week89","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.0"}},"nbformat":4,"nbformat_minor":4}
diff --git a/scripts/validate_model_for_deployment.py b/scripts/validate_model_for_deployment.py