Commit

remove news summary dataset from repo, download using script in sample (Azure#2250)

* remove news summary dataset from repo, download using script in sample

* fix to cli sample to remove dataset from repo

* typo fix

* formatting
ManojBableshwar authored May 10, 2023
1 parent 4c66992 commit 047e058
Showing 6 changed files with 134 additions and 19 deletions.
@@ -0,0 +1,87 @@
# import library to parse command line arguments
import argparse
import os

import pandas as pd

parser = argparse.ArgumentParser()
# add an argument to specify a dataset name to download
parser.add_argument("--dataset", type=str, default="cnn_dailymail", help="dataset name")
# add an argument to specify the config name of the dataset
parser.add_argument(
    "--config_name", type=str, default="3.0.0", help="config name of the dataset"
)
# argument to save a fraction of the dataset
parser.add_argument(
    "--fraction", type=float, default=0.05, help="fraction of the dataset to save"
)
# add an argument to specify the directory to download the dataset to
parser.add_argument(
    "--download_dir",
    type=str,
    default="./news-summary-dataset",
    help="directory to download the dataset to",
)
args = parser.parse_args()

# create the download directory if it does not exist
if not os.path.exists(args.download_dir):
    os.makedirs(args.download_dir)

# import hugging face datasets library
from datasets import load_dataset, get_dataset_split_names

for split in get_dataset_split_names(args.dataset, config_name=args.config_name):
    print(f"Loading {split} split of {args.dataset} dataset...")
    # load the split of the dataset
    dataset = load_dataset(args.dataset, args.config_name, split=split)
    # save the split of the dataset to the download directory as json lines file
    dataset.select(range(int(dataset.num_rows * args.fraction))).to_json(
        os.path.join(args.download_dir, f"{split}.jsonl")
    )

train_df = pd.read_json(os.path.join(args.download_dir, "train.jsonl"), lines=True)
validation_df = pd.read_json(
    os.path.join(args.download_dir, "validation.jsonl"), lines=True
)
# this dataset doesn't have test data, so split the validation_df into test_df and validation_df
test_df = validation_df.sample(frac=0.5, random_state=42)
validation_df.drop(test_df.index, inplace=True)
# drop the id column as it is not needed for fine tuning
train_df.drop(columns=["id"], inplace=True)
validation_df.drop(columns=["id"], inplace=True)
test_df.drop(columns=["id"], inplace=True)


# save 20% of the rows from the dataframes into files with small_ prefix in the ./news-summary-dataset folder
train_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_train.jsonl"), orient="records", lines=True
)
validation_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_validation.jsonl"),
    orient="records",
    lines=True,
)
test_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
)


# generate sample scoring data
# read small_test.jsonl from the download directory into a pandas dataframe
import json

test_df = pd.read_json(
    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
)
# take 1 random sample
test_df = test_df.sample(n=1)
# rebuild index
test_df.reset_index(drop=True, inplace=True)
# rename the highlights column to ground_truth_summary
test_df.rename(columns={"highlights": "ground_truth_summary"}, inplace=True)
# create a json object with the key as "inputs" and value as a list of values from the article column of the test dataframe
test_json = {"inputs": {"input_string": test_df["article"].tolist()}}
# save the json object to a file named sample_score.json in the download directory
with open(os.path.join(args.download_dir, "sample_score.json"), "w") as f:
    json.dump(test_json, f)
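
For reference, a minimal sketch of how this helper script can be invoked, mirroring the cell the notebook adds further below; the argument values are simply the defaults declared above, and the `datasets` and `pandas` packages are assumed to be installed:

# hedged usage sketch: download ~5% of cnn_dailymail 3.0.0 into ./news-summary-dataset
# (argument values are the script defaults shown above; requires the `datasets` and `pandas` packages)
import os

exit_status = os.system(
    "python ./download-dataset.py --dataset cnn_dailymail --config_name 3.0.0 "
    "--fraction 0.05 --download_dir ./news-summary-dataset"
)
if exit_status != 0:
    raise Exception("Error downloading dataset")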
@@ -3,9 +3,9 @@ set -x
# the data files are available in the same folder as the above notebook

# script inputs
subscription_id="<SUBSCRIPTION_ID>"
resource_group_name="<RESOURCE_GROUP>"
workspace_name="<WORKSPACE_NAME>"
subscription_id="21d8f407-c4c4-452e-87a4-e609bfb86248" #"<SUBSCRIPTION_ID>"
resource_group_name="rg-contoso-819prod" #"<RESOURCE_GROUP>",
workspace_name="mlw-contoso-819prod" #"WORKSPACE_NAME>",
registry_name="azureml"

compute_cluster="gpu-cluster-big"
@@ -27,15 +27,15 @@ deployment_sku="Standard_DS3_v2"


# training data
train_data="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/small_train.jsonl"
train_data="./news-summary-dataset/small_train.jsonl"
# validation data
validation_data="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/small_validation.jsonl"
validation_data="./news-summary-dataset/small_validation.jsonl"
# test data
test_data="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/small_test.jsonl"
test_data="./news-summary-dataset/small_test.jsonl"
# evaluation config
evaluation_config="../../../../../sdk/python/foundation-models/system/finetune/summarization/summarization-config.json"
evaluation_config="./summarization-config.json"
# scoring_file
scoring_file="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/sample_score.json"
scoring_file="./news-summary-dataset/sample_score.json"

# finetuning job parameters
finetuning_pipeline_component="summarization_pipeline"
@@ -49,6 +49,8 @@ number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs avail
num_train_epochs=3
learning_rate=2e-5



# 1. Setup pre-requisites

if [ "$subscription_id" = "<SUBSCRIPTION_ID>" ] || \
@@ -73,6 +75,13 @@ else
}
fi

# download the dataset

python ./download-dataset.py || {
    echo "Failed to download dataset"
    exit 1
}

# 2. Check if the model exists in the registry
# need to confirm model show command works for registries outside the tenant (aka system registry)
if ! az ml model show --name $model_name --version $model_version --registry-name $registry_name
@@ -0,0 +1,5 @@
{
"metrics": ["rouge1", "rouge2", "rougeL", "rougeLsum"],
"aggregator": true,
"stemmer": true
}
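
These keys mirror the options of the Hugging Face `rouge` metric. As an illustrative sketch only (it is an assumption that the AzureML evaluation component consumes the config exactly this way), the same metrics can be computed directly with the `evaluate` library:

# illustrative sketch: compute the metrics named in the config with the `evaluate` library
# requires `pip install evaluate rouge_score`; the prediction/reference texts are made-up examples
import evaluate

rouge = evaluate.load("rouge")
results = rouge.compute(
    predictions=["the cat sat on the mat"],
    references=["a cat was sitting on the mat"],
    rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"],
    use_aggregator=True,  # corresponds to "aggregator": true
    use_stemmer=True,  # corresponds to "stemmer": true
)
print(results)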
@@ -3,10 +3,10 @@

parser = argparse.ArgumentParser()
# add an argument to specify a dataset name to download
parser.add_argument("--dataset", type=str, default="squad", help="dataset name")
parser.add_argument("--dataset", type=str, default="cnn_dailymail", help="dataset name")
# add an argument to specify the config name of the dataset
parser.add_argument(
"--config_name", type=str, default="plain_text", help="config name of the dataset"
"--config_name", type=str, default="3.0.0", help="config name of the dataset"
)
# argument to save a fraction of the dataset
parser.add_argument(
Expand All @@ -16,7 +16,7 @@
parser.add_argument(
"--download_dir",
type=str,
default="data",
default="./news-summary-dataset",
help="directory to download the dataset to",
)
args = parser.parse_args()

This file was deleted.

@@ -10,7 +10,7 @@
"This sample shows how to use `summarization` components from the `azureml` system registry to fine tune a model to generate summary of a news article. We then deploy it to an online endpoint for real time inference. The model is trained on tiny sample of the dataset with a small number of epochs to illustrate the fine tuning approach.\n",
"\n",
"### Training data\n",
"We will use the [CNN DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset. A copy of this dataset is available in the [news-summary-dataset](./news-summary-dataset/) folder for easy access. \n",
"We will use the [CNN DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset. \n",
"\n",
"### Model\n",
"Models that can perform the `translation` task are generally good foundation models to fine tune for `summarization`. We will use the `t5-small` model in this notebook. If you opened this notebook from a specific model card, remember to replace the specific model name. Optionally, if you need to fine tune a model that is available on HuggingFace, but not available in `azureml` system registry, you can either [import](https://github.com/Azure/azureml-examples) the model or use the `huggingface_id` parameter instruct the components to pull the model directly from HuggingFace. \n",
@@ -87,7 +87,7 @@
" credential,\n",
" subscription_id=\"<SUBSCRIPTION_ID>\",\n",
" resource_group_name=\"<RESOURCE_GROUP>\",\n",
" workspace_name=\"<WORKSPACE_NAME>\",\n",
" workspace_name=\"WORKSPACE_NAME>\",\n",
" )\n",
"\n",
"# the models, fine tuning pipelines and environments are available in the AzureML system registry, \"azureml-preview\"\n",
@@ -175,10 +175,25 @@
"> The [CNN DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset is larger than 1GB when uncompressed. The [download-dataset.py](./news-summary-dataset/download-dataset.py) has supports downloading a smaller fraction of the dataset. The files in the [](./news-summary-dataset/) folder contain about 3% of the original dataset rows. \n",
"\n",
"A copy of the dataset is available in the [news-summary-dataset](./news-summary-dataset/) folder. \n",
"* Download the dataset.\n",
"* Visualize some data rows. \n",
"* We want this sample to run quickly, so save smaller `train`, `validation` and `test` files containing 20% of the already trimmed rows. This means the fine tuned model will have lower accuracy, hence it should not be put to real-world use. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# download the dataset using the helper script. This needs datasets library: https://pypi.org/project/datasets/\n",
"import os\n",
"\n",
"exit_status = os.system(\"python ./download-dataset.py\")\n",
"if exit_status != 0:\n",
" raise Exception(\"Error downloading dataset\")"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -193,7 +208,9 @@
"# load the train.jsonl, test.jsonl and validation.jsonl files from the ./news-summary-dataset/ folder and show first 5 rows\n",
"train_df = pd.read_json(\"./news-summary-dataset/train.jsonl\", lines=True)\n",
"validation_df = pd.read_json(\"./news-summary-dataset/validation.jsonl\", lines=True)\n",
"test_df = pd.read_json(\"./news-summary-dataset/test.jsonl\", lines=True)\n",
"# this dataset doesn't have test data, so split the validation_df into test_df and validation_df\n",
"test_df = validation_df.sample(frac=0.5, random_state=42)\n",
"validation_df.drop(test_df.index, inplace=True)\n",
"# drop the id column as it is not needed for fine tuning\n",
"train_df.drop(columns=[\"id\"], inplace=True)\n",
"validation_df.drop(columns=[\"id\"], inplace=True)\n",
@@ -504,10 +521,8 @@
"import json\n",
"\n",
"test_df = pd.read_json(\n",
" \"./news-summary-dataset/test.jsonl\", orient=\"records\", lines=True\n",
" \"./news-summary-dataset/small_test.jsonl\", orient=\"records\", lines=True\n",
")\n",
"# drop the id column\n",
"test_df.drop(columns=[\"id\"], inplace=True)\n",
"# take 1 random sample\n",
"test_df = test_df.sample(n=1)\n",
"# rebuild index\n",
@@ -605,7 +620,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
"version": "3.8.13"
}
},
"nbformat": 4,
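
The sample_score.json generated above is the request body for the online endpoint the notebook deploys. As a hedged sketch under stated assumptions (the endpoint and deployment names below are placeholders, not values from this commit), scoring it with the Azure ML SDK v2 could look like:

# hedged sketch: send the generated request file to the deployed online endpoint
# the workspace values and endpoint/deployment names are placeholders to fill in
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="<SUBSCRIPTION_ID>",
    resource_group_name="<RESOURCE_GROUP>",
    workspace_name="<WORKSPACE_NAME>",
)
response = ml_client.online_endpoints.invoke(
    endpoint_name="<ENDPOINT_NAME>",  # placeholder: the endpoint created by the notebook
    deployment_name="<DEPLOYMENT_NAME>",  # placeholder
    request_file="./news-summary-dataset/sample_score.json",
)
print(response)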
