diff --git a/.gitignore b/.gitignore index 4394353..3c7b352 100644 --- a/.gitignore +++ b/.gitignore @@ -159,4 +159,5 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ -artifacts/* \ No newline at end of file +artifacts/* +mlruns/* \ No newline at end of file diff --git a/README.md b/README.md index 4da46bd..decd17f 100644 --- a/README.md +++ b/README.md @@ -10,4 +10,33 @@ Update the configuration manager in src config Update the components Update the pipeline Update the main.py -Update the dvc.yaml \ No newline at end of file +Update the dvc.yaml + + + +MLflow +Documentation + +MLflow tutorial + +cmd +mlflow ui +dagshub +dagshub + +MLFLOW_TRACKING_URI=https://dagshub.com/vasalosi/End-to-End-Chest-Cancer-Classification-using-MLflow.mlflow +MLFLOW_TRACKING_USERNAME=vasalosi +MLFLOW_TRACKING_PASSWORD=YOUR_DAGSHUB_ACCESS_TOKEN +python script.py + +Run this to export as env variables: + +export MLFLOW_TRACKING_URI=https://dagshub.com/vasalosi/End-to-End-Chest-Cancer-Classification-using-MLflow.mlflow + +export MLFLOW_TRACKING_USERNAME=vasalosi + +export MLFLOW_TRACKING_PASSWORD=YOUR_DAGSHUB_ACCESS_TOKEN +DVC cmd +dvc init +dvc repro +dvc dag diff --git a/dvc.lock b/dvc.lock new file mode 100644 index 0000000..0911ed8 --- /dev/null +++ b/dvc.lock @@ -0,0 +1,113 @@ +schema: '2.0' +stages: + data_ingestion: + cmd: python src/cnnClassifier/pipeline/stage_01_data_ingestion.py + deps: + - path: config/config.yaml + hash: md5 + md5: 80450104406e3e10b8b2d62cf840b2f2 + size: 594 + - path: src/cnnClassifier/pipeline/stage_01_data_ingestion.py + hash: md5 + md5: 257f1b6398e02f7479ce8922f36b35c8 + size: 908 + outs: + - path: artifacts/data_ingestion/Chest-CT-Scan-data + hash: md5 + md5: 904fa45d934ce879b3b1933dca6cb2f1.dir + size: 49247431 + nfiles: 343 + prepare_base_model: + cmd: python src/cnnClassifier/pipeline/stage_02_prepare_base_model.py + deps: + - path: config/config.yaml + 
hash: md5 + md5: 80450104406e3e10b8b2d62cf840b2f2 + size: 594 + - path: src/cnnClassifier/pipeline/stage_02_prepare_base_model.py + hash: md5 + md5: 7f9ec5a73931e7bff22705a294223529 + size: 952 + params: + params.yaml: + CLASSES: 2 + IMAGE_SIZE: + - 224 + - 224 + - 3 + INCLUDE_TOP: false + LEARNING_RATE: 0.01 + WEIGHTS: imagenet + outs: + - path: artifacts/prepare_base_model + hash: md5 + md5: 9520d29801a13ca1113ba9ce79fd88d9.dir + size: 118054560 + nfiles: 2 + training: + cmd: python src/cnnClassifier/pipeline/stage_03_model_trainer.py + deps: + - path: artifacts/data_ingestion/Chest-CT-Scan-data + hash: md5 + md5: 904fa45d934ce879b3b1933dca6cb2f1.dir + size: 49247431 + nfiles: 343 + - path: artifacts/prepare_base_model + hash: md5 + md5: 9520d29801a13ca1113ba9ce79fd88d9.dir + size: 118054560 + nfiles: 2 + - path: config/config.yaml + hash: md5 + md5: 80450104406e3e10b8b2d62cf840b2f2 + size: 594 + - path: src/cnnClassifier/pipeline/stage_03_model_trainer.py + hash: md5 + md5: 2f4d245918743185245a30a155be2ec3 + size: 910 + params: + params.yaml: + AUGMENTATION: true + BATCH_SIZE: 16 + EPOCHS: 1 + IMAGE_SIZE: + - 224 + - 224 + - 3 + outs: + - path: artifacts/training/model.h5 + hash: md5 + md5: 531c7725e3b6d0a315faf1312ba5789b + size: 59337520 + evaluation: + cmd: python src/cnnClassifier/pipeline/stage_04_model_evaluation.py + deps: + - path: artifacts/data_ingestion/Chest-CT-Scan-data + hash: md5 + md5: 904fa45d934ce879b3b1933dca6cb2f1.dir + size: 49247431 + nfiles: 343 + - path: artifacts/training/model.h5 + hash: md5 + md5: 531c7725e3b6d0a315faf1312ba5789b + size: 59337520 + - path: config/config.yaml + hash: md5 + md5: 80450104406e3e10b8b2d62cf840b2f2 + size: 594 + - path: src/cnnClassifier/pipeline/stage_04_model_evaluation.py + hash: md5 + md5: 6d19372baf34366679787a0fb1b89f49 + size: 922 + params: + params.yaml: + BATCH_SIZE: 16 + IMAGE_SIZE: + - 224 + - 224 + - 3 + outs: + - path: scores.json + hash: md5 + md5: 4e5cd96340896497805352e330cf4a51 + size: 73 
diff --git a/dvc.yaml b/dvc.yaml index e69de29..c34b3c9 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -0,0 +1,54 @@ +stages: + data_ingestion: + cmd: python src/cnnClassifier/pipeline/stage_01_data_ingestion.py + deps: + - src/cnnClassifier/pipeline/stage_01_data_ingestion.py + - config/config.yaml + outs: + - artifacts/data_ingestion/Chest-CT-Scan-data + + + prepare_base_model: + cmd: python src/cnnClassifier/pipeline/stage_02_prepare_base_model.py + deps: + - src/cnnClassifier/pipeline/stage_02_prepare_base_model.py + - config/config.yaml + params: + - IMAGE_SIZE + - INCLUDE_TOP + - CLASSES + - WEIGHTS + - LEARNING_RATE + outs: + - artifacts/prepare_base_model + + + training: + cmd: python src/cnnClassifier/pipeline/stage_03_model_trainer.py + deps: + - src/cnnClassifier/pipeline/stage_03_model_trainer.py + - config/config.yaml + - artifacts/data_ingestion/Chest-CT-Scan-data + - artifacts/prepare_base_model + params: + - IMAGE_SIZE + - EPOCHS + - BATCH_SIZE + - AUGMENTATION + outs: + - artifacts/training/model.h5 + + + evaluation: + cmd: python src/cnnClassifier/pipeline/stage_04_model_evaluation.py + deps: + - src/cnnClassifier/pipeline/stage_04_model_evaluation.py + - config/config.yaml + - artifacts/data_ingestion/Chest-CT-Scan-data + - artifacts/training/model.h5 + params: + - IMAGE_SIZE + - BATCH_SIZE + metrics: + - scores.json: + cache: false \ No newline at end of file diff --git a/main.py b/main.py index 0e950c5..9ffa4f9 100644 --- a/main.py +++ b/main.py @@ -2,7 +2,7 @@ from cnnClassifier.pipeline.stage_01_data_ingestion import DataIngestionTrainingPipeline from cnnClassifier.pipeline.stage_02_prepare_base_model import PrepareBaseModelTrainingPipeline from cnnClassifier.pipeline.stage_03_model_trainer import ModelTrainingPipeline - +from cnnClassifier.pipeline.stage_04_model_evaluation import EvaluationPipeline STAGE_NAME = "Data Ingestion Stage" @@ -35,4 +35,17 @@ logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x") except Exception as 
e: logger.exception(e) - raise e \ No newline at end of file + raise e + + +STAGE_NAME = "Evaluation stage" +try: + logger.info(f"*******************") + logger.info(f">>>>>> stage {STAGE_NAME} started <<<<<<") + model_evalution = EvaluationPipeline() + model_evalution.main() + logger.info(f">>>>>> stage {STAGE_NAME} completed <<<<<<\n\nx==========x") + +except Exception as e: + logger.exception(e) + raise e \ No newline at end of file diff --git a/research/04_model_evaluation_with_mlflow.ipynb b/research/04_model_evaluation_with_mlflow.ipynb new file mode 100644 index 0000000..6c83213 --- /dev/null +++ b/research/04_model_evaluation_with_mlflow.ipynb @@ -0,0 +1,312 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'c:\\\\Users\\\\vasal\\\\End-to-End-Chest-Cancer-Classification-using-MLflow\\\\research'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%pwd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "os.chdir(\"../\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'c:\\\\Users\\\\vasal\\\\End-to-End-Chest-Cancer-Classification-using-MLflow'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%pwd" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"MLFLOW_TRACKING_URI\"]=\"https://dagshub.com/vasalosi/End-to-End-Chest-Cancer-Classification-using-MLflow.mlflow\"\n", + "os.environ[\"MLFLOW_TRACKING_USERNAME\"]=\"vasalosi\"\n", + "os.environ[\"MLFLOW_TRACKING_PASSWORD\"]=\"2cb87396b30ddd10b37e93c45c1ce2662812de54\"" + 
] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "model = tf.keras.models.load_model(\"artifacts/training/model.h5\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from dataclasses import dataclass\n", + "from pathlib import Path\n", + "\n", + "@dataclass(frozen=True)\n", + "class EvaluationConfig:\n", + " path_of_model: Path\n", + " training_data: Path\n", + " all_params: dict\n", + " mlflow_uri: str\n", + " params_image_size: list\n", + " params_batch_size: int" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from cnnClassifier.constants import *\n", + "from cnnClassifier.utils.common import read_yaml, create_directories, save_json" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "class ConfigurationManager:\n", + " def __init__(\n", + " self, \n", + " config_filepath = CONFIG_FILE_PATH,\n", + " params_filepath = PARAMS_FILE_PATH):\n", + " self.config = read_yaml(config_filepath)\n", + " self.params = read_yaml(params_filepath)\n", + " create_directories([self.config.artifacts_root])\n", + "\n", + " \n", + " def get_evaluation_config(self) -> EvaluationConfig:\n", + " eval_config = EvaluationConfig(\n", + " path_of_model=\"artifacts/training/model.h5\",\n", + " training_data=\"artifacts/data_ingestion/Chest-CT-Scan-data\",\n", + " mlflow_uri=\"https://dagshub.com/vasalosi/End-to-End-Chest-Cancer-Classification-using-MLflow.mlflow\",\n", + " all_params=self.params,\n", + " params_image_size=self.params.IMAGE_SIZE,\n", + " params_batch_size=self.params.BATCH_SIZE\n", + " )\n", + " return eval_config" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + 
"outputs": [], + "source": [ + "import tensorflow as tf\n", + "from pathlib import Path\n", + "import mlflow\n", + "import mlflow.keras\n", + "from urllib.parse import urlparse" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "class Evaluation:\n", + " def __init__(self, config: EvaluationConfig):\n", + " self.config = config\n", + "\n", + " \n", + " def _valid_generator(self):\n", + "\n", + " datagenerator_kwargs = dict(\n", + " rescale = 1./255,\n", + " validation_split=0.30\n", + " )\n", + "\n", + " dataflow_kwargs = dict(\n", + " target_size=self.config.params_image_size[:-1],\n", + " batch_size=self.config.params_batch_size,\n", + " interpolation=\"bilinear\"\n", + " )\n", + "\n", + " valid_datagenerator = tf.keras.preprocessing.image.ImageDataGenerator(\n", + " **datagenerator_kwargs\n", + " )\n", + "\n", + " self.valid_generator = valid_datagenerator.flow_from_directory(\n", + " directory=self.config.training_data,\n", + " subset=\"validation\",\n", + " shuffle=False,\n", + " **dataflow_kwargs\n", + " )\n", + "\n", + "\n", + " @staticmethod\n", + " def load_model(path: Path) -> tf.keras.Model:\n", + " return tf.keras.models.load_model(path)\n", + " \n", + "\n", + " def evaluation(self):\n", + " self.model = self.load_model(self.config.path_of_model)\n", + " self._valid_generator()\n", + " self.score = model.evaluate(self.valid_generator)\n", + " self.save_score()\n", + "\n", + " def save_score(self):\n", + " scores = {\"loss\": self.score[0], \"accuracy\": self.score[1]}\n", + " save_json(path=Path(\"scores.json\"), data=scores)\n", + "\n", + " \n", + " def log_into_mlflow(self):\n", + " mlflow.set_registry_uri(self.config.mlflow_uri)\n", + " tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme\n", + " \n", + " with mlflow.start_run():\n", + " mlflow.log_params(self.config.all_params)\n", + " mlflow.log_metrics(\n", + " {\"loss\": self.score[0], \"accuracy\": self.score[1]}\n", + 
" )\n", + " # Model registry does not work with file store\n", + " if tracking_url_type_store != \"file\":\n", + "\n", + " # Register the model\n", + " # There are other ways to use the Model Registry, which depends on the use case,\n", + " # please refer to the doc for more information:\n", + " # https://mlflow.org/docs/latest/model-registry.html#api-workflow\n", + " mlflow.keras.log_model(self.model, \"model\", registered_model_name=\"VGG16Model\")\n", + " else:\n", + " mlflow.keras.log_model(self.model, \"model\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-03-29 12:45:00,281: INFO: common: yaml file: config\\config.yaml loaded successfully]\n", + "[2024-03-29 12:45:00,285: INFO: common: yaml file: params.yaml loaded successfully]\n", + "[2024-03-29 12:45:00,286: INFO: common: created directory at: artifacts]\n", + "Found 102 images belonging to 2 classes.\n", + "7/7 [==============================] - 11s 1s/step - loss: 19.7448 - accuracy: 0.5686\n", + "[2024-03-29 12:45:11,441: INFO: common: json file saved at: scores.json]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/03/29 12:45:12 WARNING mlflow.tensorflow: You are saving a TensorFlow Core model or Keras model without a signature. Inference with mlflow.pyfunc.spark_udf() will not work unless the model's pyfunc representation accepts pandas DataFrames as inference inputs.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2024-03-29 12:45:14,072: WARNING: save: Found untraced functions such as _jit_compiled_convolution_op, _jit_compiled_convolution_op, _jit_compiled_convolution_op, _jit_compiled_convolution_op, _jit_compiled_convolution_op while saving (showing 5 of 14). 
These functions will not be directly callable after loading.]\n", + "INFO:tensorflow:Assets written to: C:\\Users\\vasal\\AppData\\Local\\Temp\\tmpqw435vo9\\model\\data\\model\\assets\n", + "[2024-03-29 12:45:14,577: INFO: builder_impl: Assets written to: C:\\Users\\vasal\\AppData\\Local\\Temp\\tmpqw435vo9\\model\\data\\model\\assets]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\vasal\\anaconda3\\envs\\cancer_research\\lib\\site-packages\\_distutils_hack\\__init__.py:33: UserWarning: Setuptools is replacing distutils.\n", + " warnings.warn(\"Setuptools is replacing distutils.\")\n", + "Registered model 'VGG16Model' already exists. Creating a new version of this model...\n", + "2024/03/29 12:46:18 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: VGG16Model, version 4\n", + "Created version '4' of model 'VGG16Model'.\n" + ] + } + ], + "source": [ + "try:\n", + " config = ConfigurationManager()\n", + " eval_config = config.get_evaluation_config()\n", + " evaluation = Evaluation(eval_config)\n", + " evaluation.evaluation()\n", + " evaluation.log_into_mlflow()\n", + "\n", + "except Exception as e:\n", + " raise e" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "cancer_research", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.18" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/scores.json b/scores.json new file mode 100644 index 0000000..0ced02d --- /dev/null +++ b/scores.json @@ -0,0 +1,4 @@ +{ + "loss": 9.471105575561523, + "accuracy": 0.5686274766921997 +} \ No newline at end of file diff --git a/src/cnnClassifier/components/model_evaluation_mlflow.py 
class Evaluation:
    """Evaluate a trained Keras model and record the result locally and in MLflow.

    Typical use is ``evaluation()`` followed by ``log_into_mlflow()``.
    ``evaluation()`` populates ``model``, ``valid_generator`` and ``score``;
    the other public methods require that it has run first.
    """

    def __init__(self, config: "EvaluationConfig"):
        """Store the evaluation configuration.

        Args:
            config: Evaluation settings. This class reads ``path_of_model``,
                ``training_data``, ``all_params``, ``mlflow_uri``,
                ``params_image_size`` and ``params_batch_size`` from it.
        """
        self.config = config
        # Filled in by evaluation(); None means "not evaluated yet".
        self.model = None
        self.valid_generator = None
        self.score = None

    def _valid_generator(self):
        """Build the validation data iterator and store it on ``self.valid_generator``."""
        datagenerator_kwargs = dict(
            rescale=1./255,
            validation_split=0.30
        )

        dataflow_kwargs = dict(
            # params_image_size carries a trailing element that is dropped here,
            # leaving the leading dimensions as the resize target.
            target_size=self.config.params_image_size[:-1],
            batch_size=self.config.params_batch_size,
            interpolation="bilinear"
        )

        valid_datagenerator = tf.keras.preprocessing.image.ImageDataGenerator(
            **datagenerator_kwargs
        )

        self.valid_generator = valid_datagenerator.flow_from_directory(
            directory=self.config.training_data,
            subset="validation",
            shuffle=False,
            **dataflow_kwargs
        )

    @staticmethod
    def load_model(path: Path) -> "tf.keras.Model":
        """Load and return the Keras model stored at ``path``."""
        return tf.keras.models.load_model(path)

    def _require_score(self):
        """Return ``(loss, accuracy)`` from the last evaluation.

        Raises:
            RuntimeError: If ``evaluation()`` has not produced a score yet.
        """
        score = getattr(self, "score", None)
        if score is None:
            raise RuntimeError(
                "No evaluation score available; call evaluation() first."
            )
        # NOTE(review): index 1 is reported as accuracy; this assumes the model
        # was compiled with accuracy as its first metric -- confirm in training.
        return score[0], score[1]

    def evaluation(self):
        """Load the model, evaluate it on the validation split and save the score."""
        self.model = self.load_model(self.config.path_of_model)
        self._valid_generator()
        self.score = self.model.evaluate(self.valid_generator)
        self.save_score()

    def save_score(self):
        """Write the loss and accuracy of the last evaluation to ``scores.json``.

        Raises:
            RuntimeError: If ``evaluation()`` has not been run.
        """
        loss, accuracy = self._require_score()
        scores = {"loss": loss, "accuracy": accuracy}
        save_json(path=Path("scores.json"), data=scores)

    def log_into_mlflow(self):
        """Log parameters, metrics and the model to MLflow in a new run.

        The model is also registered as ``VGG16Model`` unless the tracking
        store is a local file store.

        Raises:
            RuntimeError: If ``evaluation()`` has not been run.
        """
        # Validate before touching MLflow so a misuse does not leave behind a
        # started run that only contains parameters.
        loss, accuracy = self._require_score()

        # NOTE(review): only the registry URI comes from config; the tracking
        # URI is whatever MLflow resolves on its own (for example from the
        # MLFLOW_TRACKING_URI environment variable) -- confirm this is intended.
        mlflow.set_registry_uri(self.config.mlflow_uri)
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        # Model registry does not work with file store, so only request
        # registration for any other kind of tracking store.
        # See https://mlflow.org/docs/latest/model-registry.html#api-workflow
        log_model_kwargs = {}
        if tracking_url_type_store != "file":
            log_model_kwargs["registered_model_name"] = "VGG16Model"

        with mlflow.start_run():
            mlflow.log_params(self.config.all_params)
            mlflow.log_metrics({"loss": loss, "accuracy": accuracy})
            mlflow.keras.log_model(self.model, "model", **log_model_kwargs)
class EvaluationPipeline:
    """Pipeline stage that evaluates the trained model and logs the run to MLflow."""

    def __init__(self):
        pass

    def main(self):
        """Build the evaluation config, run the evaluation and log it to MLflow."""
        config = ConfigurationManager()
        eval_config = config.get_evaluation_config()
        evaluation = Evaluation(eval_config)
        # evaluation() already calls save_score() itself, so the scores file is
        # written once here rather than being saved a second time afterwards.
        evaluation.evaluation()
        evaluation.log_into_mlflow()