From b389a065e7e54fbe856171e2acf9c5f070dcb004 Mon Sep 17 00:00:00 2001 From: tedoaba Date: Mon, 28 Oct 2024 16:35:27 +0300 Subject: [PATCH] database url removed --- docker-compose.yml | 12 ------ ...kaim2-week-8-9-task-2-balanced_class.ipynb | 1 + scripts/validate_model_for_deployment.py | 41 ------------------- 3 files changed, 1 insertion(+), 53 deletions(-) create mode 100644 notebooks/kaim2-week-8-9-task-2-balanced_class.ipynb delete mode 100644 scripts/validate_model_for_deployment.py diff --git a/docker-compose.yml b/docker-compose.yml index d0403a7..69de73d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -17,18 +17,6 @@ services: volumes: - .:/app - db: - image: postgres:17 - container_name: postgres_db - environment: - POSTGRES_DB: ${POSTGRES_DB} - POSTGRES_USER: ${POSTGRES_USER} - POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} - ports: - - "5432:5432" - volumes: - - postgres_data:/var/lib/postgresql/data - css: image: node:14 container_name: css_processor diff --git a/notebooks/kaim2-week-8-9-task-2-balanced_class.ipynb b/notebooks/kaim2-week-8-9-task-2-balanced_class.ipynb new file mode 100644 index 0000000..a97a86d --- /dev/null +++ b/notebooks/kaim2-week-8-9-task-2-balanced_class.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","metadata":{"_cell_guid":"c53980f3-43e6-4309-b5b3-8c387c94c2ef","_uuid":"ac363b82-57c3-4cb3-87ed-d2c3d463e1a5","trusted":true},"source":["# KAIM Week 8 and 9 Challenges"]},{"cell_type":"markdown","metadata":{"_cell_guid":"9597c964-28b3-47e3-9f51-0691092476ce","_uuid":"977dc833-b4c5-441a-9ca2-d7e8e86bedb1","trusted":true},"source":["## **Task 2: MOdel Building**"]},{"cell_type":"markdown","metadata":{"_cell_guid":"87a00a83-d488-4a01-a3c8-2d2d9f956d41","_uuid":"7ffdbba4-f7f1-4b89-92ea-16cc52919bc6","trusted":true},"source":["## Import Necessary Libraries"]},{"cell_type":"code","execution_count":1,"metadata":{"_cell_guid":"98500711-7c8e-4a73-a926-ad743b2ca99a","_uuid":"05984005-6423-4192-8e7f-856e08148d1f","collapsed":false,"execution":{"iopub.execute_input":"2024-10-18T11:28:10.886046Z","iopub.status.busy":"2024-10-18T11:28:10.885064Z","iopub.status.idle":"2024-10-18T11:28:12.528653Z","shell.execute_reply":"2024-10-18T11:28:12.527539Z","shell.execute_reply.started":"2024-10-18T11:28:10.886001Z"},"jupyter":{"outputs_hidden":false},"trusted":true},"outputs":[],"source":["# Import necessary libraries\n","import pandas as pd\n","import numpy as np\n","import matplotlib.pyplot as plt\n","import seaborn as sns\n","from sklearn.preprocessing import StandardScaler, LabelEncoder\n","from sklearn.impute import SimpleImputer\n","from sklearn.model_selection import train_test_split\n","from sklearn.linear_model import LogisticRegression\n","from sklearn.ensemble import RandomForestClassifier\n","from xgboost import XGBClassifier\n","from sklearn.metrics import classification_report\n","from sklearn.tree import DecisionTreeClassifier\n","from sklearn.neural_network import MLPClassifier\n","from tensorflow.keras.models import Sequential\n","from tensorflow.keras.layers import Dense, Conv2D, Flatten, SimpleRNN, LSTM\n","import mlflow\n","import mlflow.sklearn\n","import mlflow.xgboost\n","import warnings\n","from imblearn.combine import SMOTETomek\n","\n","warnings.filterwarnings('ignore')\n","\n","# Set plot style for better visuals\n","sns.set(style=\"whitegrid\")"]},{"cell_type":"markdown","metadata":{"_cell_guid":"59a324ba-900b-40da-8f79-ee8f7957ef5b","_uuid":"8c78d907-d277-48a3-951f-ee7f01b6a7a7","trusted":true},"source":["## Load Datasets"]},{"cell_type":"code","execution_count":2,"metadata":{"_cell_guid":"98601976-817f-413e-ad3a-8c7eedb11606","_uuid":"b72ec9f6-c664-4339-8d29-555121dbd61f","collapsed":false,"execution":{"iopub.execute_input":"2024-10-18T11:28:15.198096Z","iopub.status.busy":"2024-10-18T11:28:15.197361Z","iopub.status.idle":"2024-10-18T11:28:20.098319Z","shell.execute_reply":"2024-10-18T11:28:20.097043Z","shell.execute_reply.started":"2024-10-18T11:28:15.198047Z"},"jupyter":{"outputs_hidden":false},"trusted":true},"outputs":[],"source":["# Load the datasets\n","fraud_data = pd.read_csv('../data/cleaned_data_1.csv')\n","credit_data = pd.read_csv('../data/cleaned_data_2.csv')"]},{"cell_type":"code","execution_count":3,"metadata":{},"outputs":[{"data":{"text/plain":["((138846, 15), (283726, 31))"]},"execution_count":3,"metadata":{},"output_type":"execute_result"}],"source":["fraud_data.shape, credit_data.shape"]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[{"data":{"text/plain":["(Index(['user_id', 'purchase_value', 'source', 'browser', 'sex', 'age',\n"," 'ip_address', 'Class', 'country', 'lower_bound_ip_addres',\n"," 'upper_bound_ip_adress', 'signup_purchase_diff', 'transaction_count',\n"," 'hour_of_day', 'day_of_week'],\n"," dtype='object'),\n"," Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',\n"," 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',\n"," 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',\n"," 'Class'],\n"," dtype='object'))"]},"execution_count":4,"metadata":{},"output_type":"execute_result"}],"source":["fraud_data.columns, credit_data.columns"]},{"cell_type":"markdown","metadata":{"_cell_guid":"a1c436e7-42e4-474c-8a8f-b00592a274ff","_uuid":"79aa1d6d-a15f-4224-815b-4344da833bbf","trusted":true},"source":["## Model Building"]},{"cell_type":"code","execution_count":5,"metadata":{"_cell_guid":"a49734b2-8a44-42d2-8b30-bd5a916c17eb","_uuid":"ab90e452-bf1c-4287-a72e-c1599293b5b7","collapsed":false,"execution":{"iopub.execute_input":"2024-10-18T12:12:29.677630Z","iopub.status.busy":"2024-10-18T12:12:29.677086Z","iopub.status.idle":"2024-10-18T12:13:18.305808Z","shell.execute_reply":"2024-10-18T12:13:18.304575Z","shell.execute_reply.started":"2024-10-18T12:12:29.677583Z"},"jupyter":{"outputs_hidden":false},"trusted":true},"outputs":[{"name":"stdout","output_type":"stream","text":["(array([0, 1], dtype=int64), array([125849, 12997], dtype=int64))\n","(array([0, 1], dtype=int64), array([283253, 473], dtype=int64))\n"]}],"source":["# Prepare data for the model (e-commerce)\n","X1 = fraud_data.drop(columns=['Class'])\n","X2 = credit_data.drop(columns=['Class'])\n","\n","y1 = fraud_data['Class']\n","y2 = credit_data['Class']\n","\n","print(np.unique(y1, return_counts=True))\n","print(np.unique(y2, return_counts=True))"]},{"cell_type":"code","execution_count":6,"metadata":{},"outputs":[],"source":["X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)\n","\n","X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)\n"]},{"cell_type":"markdown","metadata":{},"source":["## Class Balancing with SMOTE"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["smt = SMOTETomek(random_state=42)\n","X_train1, y_train1 = smt.fit_resample(X_train1, y_train1)\n","X_train2, y_train2 = smt.fit_resample(X_train2, y_train2)\n","\n","print(np.unique(X_train1, return_counts=True))\n","print(np.unique(X_train2, return_counts=True))\n","print(np.unique(y_train1, return_counts=True))\n","print(np.unique(y_train2, return_counts=True))"]},{"cell_type":"markdown","metadata":{},"source":["### Logistic Regression"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" precision recall f1-score support\n","\n"," 0 0.91 1.00 0.95 25193\n"," 1 0.00 0.00 0.00 2577\n","\n"," accuracy 0.91 27770\n"," macro avg 0.45 0.50 0.48 27770\n","weighted avg 0.82 0.91 0.86 27770\n","\n"]}],"source":["log_reg = LogisticRegression(C=1, solver='liblinear')\n","log_reg.fit(X_train, y_train)\n","y_pred_log_reg = log_reg.predict(X_test)\n","print(classification_report(y_test, y_pred_log_reg))"]},{"cell_type":"markdown","metadata":{},"source":["### Random Forest"]},{"cell_type":"code","execution_count":9,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" precision recall f1-score support\n","\n"," 0 0.96 1.00 0.98 25193\n"," 1 1.00 0.55 0.71 2577\n","\n"," accuracy 0.96 27770\n"," macro avg 0.98 0.77 0.84 27770\n","weighted avg 0.96 0.96 0.95 27770\n","\n"]}],"source":["rf_clf = RandomForestClassifier(n_estimators=30, max_depth=3)\n","rf_clf.fit(X_train, y_train)\n","y_pred_rf = rf_clf.predict(X_test)\n","print(classification_report(y_test, y_pred_rf))"]},{"cell_type":"markdown","metadata":{},"source":["### XGBoost "]},{"cell_type":"code","execution_count":10,"metadata":{},"outputs":[{"name":"stdout","output_type":"stream","text":[" precision recall f1-score support\n","\n"," 0 0.96 1.00 0.98 25193\n"," 1 0.94 0.55 0.69 2577\n","\n"," accuracy 0.96 27770\n"," macro avg 0.95 0.77 0.84 27770\n","weighted avg 0.95 0.96 0.95 27770\n","\n"]}],"source":["xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')\n","xgb_clf.fit(X_train, y_train)\n","y_pred_xgb = xgb_clf.predict(X_test)\n","print(classification_report(y_test, y_pred_xgb))"]},{"cell_type":"markdown","metadata":{},"source":["## Experiments"]},{"cell_type":"code","execution_count":50,"metadata":{},"outputs":[],"source":["datasets = {\n"," \"fraud_data\": (X_train1, y_train1, X_test1, y_test1),\n"," \"credit_data\": (X_train2, y_train2, X_test2, y_test2),\n","}"]},{"cell_type":"code","execution_count":54,"metadata":{},"outputs":[],"source":["models = [\n"," (\n"," \"Logistic Regression\", \n"," LogisticRegression(C=1, solver='liblinear')\n"," ),\n"," (\n"," \"Random Forest\", \n"," RandomForestClassifier(n_estimators=30, max_depth=3)\n"," ),\n"," (\n"," \"XGBClassifier\",\n"," XGBClassifier(use_label_encoder=False, eval_metric='logloss')\n"," )\n","]"]},{"cell_type":"code","execution_count":16,"metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["2024/10/20 13:43:16 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 13:43:16 INFO mlflow.tracking._tracking_service.client: ๐Ÿƒ View run Logistic Regression at: http://localhost:5000/#/experiments/935012191474515353/runs/66416527b5ad44a0be0886ed9e43b658.\n","2024/10/20 13:43:16 INFO mlflow.tracking._tracking_service.client: ๐Ÿงช View experiment at: http://localhost:5000/#/experiments/935012191474515353.\n","2024/10/20 13:43:20 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 13:43:21 INFO mlflow.tracking._tracking_service.client: ๐Ÿƒ View run Random Forest at: http://localhost:5000/#/experiments/935012191474515353/runs/3c3923af256d4ca4947318c3e70a6fbf.\n","2024/10/20 13:43:21 INFO mlflow.tracking._tracking_service.client: ๐Ÿงช View experiment at: http://localhost:5000/#/experiments/935012191474515353.\n","2024/10/20 13:43:25 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 13:43:25 INFO mlflow.tracking._tracking_service.client: ๐Ÿƒ View run XGBClassifier at: http://localhost:5000/#/experiments/935012191474515353/runs/019f76c5419a4e24a15a654ad2358b1c.\n","2024/10/20 13:43:25 INFO mlflow.tracking._tracking_service.client: ๐Ÿงช View experiment at: http://localhost:5000/#/experiments/935012191474515353.\n","2024/10/20 13:43:28 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 13:43:29 INFO mlflow.tracking._tracking_service.client: ๐Ÿƒ View run XGBClassifier With SMOTE at: http://localhost:5000/#/experiments/935012191474515353/runs/b89994c7190a441495941622db7bf9ac.\n","2024/10/20 13:43:29 INFO mlflow.tracking._tracking_service.client: ๐Ÿงช View experiment at: http://localhost:5000/#/experiments/935012191474515353.\n"]}],"source":["# Initialize MLflow\n","mlflow.set_experiment(\"Fraud Detection Models - Single Dataset\")\n","mlflow.set_tracking_uri(\"http://localhost:5000\")\n","\n","reports = []\n","\n","for model_name, model, train_set, test_set in models:\n"," X_train = train_set[0]\n"," y_train = train_set[1]\n"," X_test = test_set[0]\n"," y_test = test_set[1]\n"," \n"," model.fit(X_train, y_train)\n"," y_pred = model.predict(X_test)\n"," report = classification_report(y_test, y_pred, output_dict=True)\n"," reports.append(report)\n","\n","for i, element in enumerate(models):\n"," model_name = element[0]\n"," model = element[1]\n"," report = reports[i]\n"," \n"," with mlflow.start_run(run_name=model_name): \n"," mlflow.log_param(\"model\", model_name)\n"," mlflow.log_metric('accuracy', report['accuracy'])\n"," mlflow.log_metric('recall_class_1', report['1']['recall'])\n"," mlflow.log_metric('recall_class_0', report['0']['recall'])\n"," mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score']) \n"," \n"," if \"XGB\" in model_name:\n"," mlflow.xgboost.log_model(model, \"model\")\n"," else:\n"," mlflow.sklearn.log_model(model, \"model\") "]},{"cell_type":"code","execution_count":55,"metadata":{},"outputs":[{"name":"stderr","output_type":"stream","text":["2024/10/20 20:16:21 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 20:16:21 INFO mlflow.tracking._tracking_service.client: ๐Ÿƒ View run Logistic Regression_fraud_data at: http://localhost:5000/#/experiments/478268722598582565/runs/8273f777699647a39535d873391b9d13.\n","2024/10/20 20:16:21 INFO mlflow.tracking._tracking_service.client: ๐Ÿงช View experiment at: http://localhost:5000/#/experiments/478268722598582565.\n","2024/10/20 20:16:24 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 20:16:24 INFO mlflow.tracking._tracking_service.client: ๐Ÿƒ View run Random Forest_fraud_data at: http://localhost:5000/#/experiments/478268722598582565/runs/65913aaab7ce49948b90f6752b771312.\n","2024/10/20 20:16:24 INFO mlflow.tracking._tracking_service.client: ๐Ÿงช View experiment at: http://localhost:5000/#/experiments/478268722598582565.\n","2024/10/20 20:16:27 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 20:16:28 INFO mlflow.tracking._tracking_service.client: ๐Ÿƒ View run XGBClassifier_fraud_data at: http://localhost:5000/#/experiments/478268722598582565/runs/e262b2c32c8442b2a720a086f36a2f63.\n","2024/10/20 20:16:28 INFO mlflow.tracking._tracking_service.client: ๐Ÿงช View experiment at: http://localhost:5000/#/experiments/478268722598582565.\n","2024/10/20 20:16:52 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 20:16:52 INFO mlflow.tracking._tracking_service.client: ๐Ÿƒ View run Logistic Regression_credit_data at: http://localhost:5000/#/experiments/478268722598582565/runs/37abcc1c28b54206bffc2f7695ca9d49.\n","2024/10/20 20:16:52 INFO mlflow.tracking._tracking_service.client: ๐Ÿงช View experiment at: http://localhost:5000/#/experiments/478268722598582565.\n","2024/10/20 20:16:56 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 20:16:56 INFO mlflow.tracking._tracking_service.client: ๐Ÿƒ View run Random Forest_credit_data at: http://localhost:5000/#/experiments/478268722598582565/runs/afef239a5b3845b082ab9368d9621edd.\n","2024/10/20 20:16:56 INFO mlflow.tracking._tracking_service.client: ๐Ÿงช View experiment at: http://localhost:5000/#/experiments/478268722598582565.\n","2024/10/20 20:16:59 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\n","2024/10/20 20:16:59 INFO mlflow.tracking._tracking_service.client: ๐Ÿƒ View run XGBClassifier_credit_data at: http://localhost:5000/#/experiments/478268722598582565/runs/bcfb79f52acc4f82bf61298763c208df.\n","2024/10/20 20:16:59 INFO mlflow.tracking._tracking_service.client: ๐Ÿงช View experiment at: http://localhost:5000/#/experiments/478268722598582565.\n"]}],"source":["# Initialize MLflow\n","mlflow.set_experiment(\"Fraud Detection Models - 2 Datasets\")\n","mlflow.set_tracking_uri(\"http://localhost:5000\")\n","\n","# Iterate through datasets\n","for dataset_name, (X_train, y_train, X_test, y_test) in datasets.items():\n"," reports = []\n","\n"," # Train each model on the current dataset\n"," for model_name, model in models:\n"," model.fit(X_train, y_train)\n"," y_pred = model.predict(X_test)\n"," report = classification_report(y_test, y_pred, output_dict=True)\n"," reports.append(report)\n","\n"," # Log each model's performance metrics to MLflow\n"," for i, (model_name, model) in enumerate(models):\n"," report = reports[i]\n","\n"," with mlflow.start_run(run_name=f\"{model_name}_{dataset_name}\"):\n"," mlflow.log_param(\"model\", model_name)\n"," mlflow.log_param(\"dataset\", dataset_name)\n"," mlflow.log_metric('accuracy', report['accuracy'])\n"," mlflow.log_metric('recall_class_1', report['1']['recall'])\n"," mlflow.log_metric('recall_class_0', report['0']['recall'])\n"," mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score'])\n","\n"," # Log the model using the appropriate MLflow method\n"," if \"XGBoost\" in model_name:\n"," mlflow.xgboost.log_model(model, \"model\")\n"," else:\n"," mlflow.sklearn.log_model(model, \"model\")\n"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]}],"metadata":{"kaggle":{"accelerator":"none","dataSources":[{"datasetId":5896976,"sourceId":9653739,"sourceType":"datasetVersion"}],"dockerImageVersionId":30786,"isGpuEnabled":false,"isInternetEnabled":true,"language":"python","sourceType":"notebook"},"kernelspec":{"display_name":".week89","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.0"}},"nbformat":4,"nbformat_minor":4} diff --git a/scripts/validate_model_for_deployment.py b/scripts/validate_model_for_deployment.py deleted file mode 100644 index 219b822..0000000 --- a/scripts/validate_model_for_deployment.py +++ /dev/null @@ -1,41 +0,0 @@ -import mlflow -# Predict on a Pandas DataFrame. -import pandas as pd - -from mlflow import MlflowClient - -client = MlflowClient() -client.create_registered_model("xgb_credit_data") - -client = MlflowClient() -result = client.create_model_version( - name="xgb_credit_data", - source='mlartifacts/377415131768587313/cc1f15c2202b4dff854b81d2d968e4df/artifacts/model', - run_id="cc1f15c2202b4dff854b81d2d968e4df", -) - - -#import pandas as pd - -# Define the input values as a dictionary (as per your example) -input_values = { - 'Time': 0.0, 'V1': -1.359807, 'V2': -0.072781, 'V3': 2.536346, - 'V4': 1.378155, 'V5': -0.338321, 'V6': 0.462388, 'V7': 0.239599, - 'V8': 0.098698, 'V9': 0.363787, 'V10': 0.090794, 'V11': -0.551600, - 'V12': -0.617801, 'V13': -0.991390, 'V14': -0.311169, 'V15': 1.468177, - 'V16': -0.470401, 'V17': 0.207971, 'V18': 0.025791, 'V19': 0.403993, - 'V20': 0.251412, 'V21': -0.018307, 'V22': 0.277838, 'V23': -0.110474, - 'V24': 0.066928, 'V25': 0.128539, 'V26': -0.189115, 'V27': 0.133558, - 'V28': -0.021053, 'Amount': 149.62 -} - -# Convert the dictionary to a Pandas DataFrame -input_df = pd.DataFrame([input_values]) - -# Display the DataFrame -print(input_df) - -# Load model as a PyFuncModel. -loaded_model = mlflow.pyfunc.load_model(result) - -loaded_model.predict(input_df) \ No newline at end of file