Commit

remove news summary dataset from repo, download using script in sample (Azure#2250)

* remove news summary dataset from repo, download using script in sample

* fix to cli sample to remove dataset from repo

* typo fix

* formatting
ManojBableshwar authored May 10, 2023
1 parent 4c66992 commit 047e058
Showing 6 changed files with 134 additions and 19 deletions.
@@ -0,0 +1,87 @@
# import library to parse command line arguments
import argparse
import os

import pandas as pd

parser = argparse.ArgumentParser()
# add an argument to specify a dataset name to download
parser.add_argument("--dataset", type=str, default="cnn_dailymail", help="dataset name")
# add an argument to specify the config name of the dataset
parser.add_argument(
    "--config_name", type=str, default="3.0.0", help="config name of the dataset"
)
# argument to save a fraction of the dataset
parser.add_argument(
    "--fraction", type=float, default=0.05, help="fraction of the dataset to save"
)
# add an argument to specify the directory to download the dataset to
parser.add_argument(
    "--download_dir",
    type=str,
    default="./news-summary-dataset",
    help="directory to download the dataset to",
)
args = parser.parse_args()

# create the download directory if it does not exist
if not os.path.exists(args.download_dir):
    os.makedirs(args.download_dir)

# import hugging face datasets library
from datasets import load_dataset, get_dataset_split_names

for split in get_dataset_split_names(args.dataset, config_name=args.config_name):
    print(f"Loading {split} split of {args.dataset} dataset...")
    # load the split of the dataset
    dataset = load_dataset(args.dataset, args.config_name, split=split)
    # save the split of the dataset to the download directory as json lines file
    dataset.select(range(int(dataset.num_rows * args.fraction))).to_json(
        os.path.join(args.download_dir, f"{split}.jsonl")
    )

train_df = pd.read_json(os.path.join(args.download_dir, "train.jsonl"), lines=True)
validation_df = pd.read_json(
    os.path.join(args.download_dir, "validation.jsonl"), lines=True
)
# this dataset doesn't have test data, so split the validation_df into test_df and validation_df
test_df = validation_df.sample(frac=0.5, random_state=42)
validation_df.drop(test_df.index, inplace=True)
# drop the id column as it is not needed for fine tuning
train_df.drop(columns=["id"], inplace=True)
validation_df.drop(columns=["id"], inplace=True)
test_df.drop(columns=["id"], inplace=True)


# save 20% of the rows from the dataframes into files with small_ prefix in the ./news-summary-dataset folder
train_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_train.jsonl"), orient="records", lines=True
)
validation_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_validation.jsonl"),
    orient="records",
    lines=True,
)
test_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
)


# generate sample scoring data
# read small_test.jsonl from the download directory into a pandas dataframe
import json

test_df = pd.read_json(
    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
)
# take 1 random sample
test_df = test_df.sample(n=1)
# rebuild index
test_df.reset_index(drop=True, inplace=True)
# rename the highlights column to ground_truth_summary
test_df.rename(columns={"highlights": "ground_truth_summary"}, inplace=True)
# create a json object with the key as "inputs" and value as a list of values from the article column of the test dataframe
test_json = {"inputs": {"input_string": test_df["article"].tolist()}}
# save the json object to a file named sample_score.json in the download directory
with open(os.path.join(args.download_dir, "sample_score.json"), "w") as f:
    json.dump(test_json, f)
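
For reference, a minimal sketch of how this helper script can be invoked, mirroring the cell the notebook adds further below; the argument values are simply the defaults declared above, and the `datasets` and `pandas` packages are assumed to be installed:

# hedged usage sketch: download ~5% of cnn_dailymail 3.0.0 into ./news-summary-dataset
# (argument values are the script defaults shown above; requires the `datasets` and `pandas` packages)
import os

exit_status = os.system(
    "python ./download-dataset.py --dataset cnn_dailymail --config_name 3.0.0 "
    "--fraction 0.05 --download_dir ./news-summary-dataset"
)
if exit_status != 0:
    raise Exception("Error downloading dataset")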
@@ -3,9 +3,9 @@ set -x
# the data files are available in the same folder as the above notebook

# script inputs
subscription_id="<SUBSCRIPTION_ID>"
resource_group_name="<RESOURCE_GROUP>"
workspace_name="<WORKSPACE_NAME>"
subscription_id="21d8f407-c4c4-452e-87a4-e609bfb86248" #"<SUBSCRIPTION_ID>"
resource_group_name="rg-contoso-819prod" #"<RESOURCE_GROUP>",
workspace_name="mlw-contoso-819prod" #"WORKSPACE_NAME>",
registry_name="azureml"

compute_cluster="gpu-cluster-big"
@@ -27,15 +27,15 @@ deployment_sku="Standard_DS3_v2"


# training data
train_data="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/small_train.jsonl"
train_data="./news-summary-dataset/small_train.jsonl"
# validation data
validation_data="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/small_validation.jsonl"
validation_data="./news-summary-dataset/small_validation.jsonl"
# test data
test_data="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/small_test.jsonl"
test_data="./news-summary-dataset/small_test.jsonl"
# evaluation config
evaluation_config="../../../../../sdk/python/foundation-models/system/finetune/summarization/summarization-config.json"
evaluation_config="./summarization-config.json"
# scoring_file
scoring_file="../../../../../sdk/python/foundation-models/system/finetune/summarization/news-summary-dataset/sample_score.json"
scoring_file="./news-summary-dataset/sample_score.json"

# finetuning job parameters
finetuning_pipeline_component="summarization_pipeline"
@@ -49,6 +49,8 @@ number_of_gpu_to_use_finetuning=$gpus_per_node # set to the number of GPUs avail
num_train_epochs=3
learning_rate=2e-5



# 1. Setup pre-requisites

if [ "$subscription_id" = "<SUBSCRIPTION_ID>" ] || \
@@ -73,6 +75,13 @@ else
}
fi

# download the dataset

python ./download-dataset.py || {
    echo "Failed to download dataset"
    exit 1
}

# 2. Check if the model exists in the registry
# need to confirm model show command works for registries outside the tenant (aka system registry)
if ! az ml model show --name $model_name --version $model_version --registry-name $registry_name
@@ -0,0 +1,5 @@
{
"metrics": ["rouge1", "rouge2", "rougeL", "rougeLsum"],
"aggregator": true,
"stemmer": true
}
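
These keys mirror the options of the Hugging Face `rouge` metric. As an illustrative sketch only (it is an assumption that the AzureML evaluation component consumes the config exactly this way), the same metrics can be computed directly with the `evaluate` library:

# illustrative sketch: compute the metrics named in the config with the `evaluate` library
# requires `pip install evaluate rouge_score`; the prediction/reference texts are made-up examples
import evaluate

rouge = evaluate.load("rouge")
results = rouge.compute(
    predictions=["the cat sat on the mat"],
    references=["a cat was sitting on the mat"],
    rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"],
    use_aggregator=True,  # corresponds to "aggregator": true
    use_stemmer=True,  # corresponds to "stemmer": true
)
print(results)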
@@ -3,10 +3,10 @@

parser = argparse.ArgumentParser()
# add an argument to specify a dataset name to download
parser.add_argument("--dataset", type=str, default="squad", help="dataset name")
parser.add_argument("--dataset", type=str, default="cnn_dailymail", help="dataset name")
# add an argument to specify the config name of the dataset
parser.add_argument(
"--config_name", type=str, default="plain_text", help="config name of the dataset"
"--config_name", type=str, default="3.0.0", help="config name of the dataset"
)
# argument to save a fraction of the dataset
parser.add_argument(
Expand All @@ -16,7 +16,7 @@
parser.add_argument(
"--download_dir",
type=str,
default="data",
default="./news-summary-dataset",
help="directory to download the dataset to",
)
args = parser.parse_args()

This file was deleted.

@@ -10,7 +10,7 @@
"This sample shows how to use `summarization` components from the `azureml` system registry to fine tune a model to generate summary of a news article. We then deploy it to an online endpoint for real time inference. The model is trained on tiny sample of the dataset with a small number of epochs to illustrate the fine tuning approach.\n",
"\n",
"### Training data\n",
"We will use the [CNN DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset. A copy of this dataset is available in the [news-summary-dataset](./news-summary-dataset/) folder for easy access. \n",
"We will use the [CNN DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset. \n",
"\n",
"### Model\n",
"Models that can perform the `translation` task are generally good foundation models to fine tune for `summarization`. We will use the `t5-small` model in this notebook. If you opened this notebook from a specific model card, remember to replace the specific model name. Optionally, if you need to fine tune a model that is available on HuggingFace, but not available in `azureml` system registry, you can either [import](https://github.com/Azure/azureml-examples) the model or use the `huggingface_id` parameter instruct the components to pull the model directly from HuggingFace. \n",
@@ -87,7 +87,7 @@
" credential,\n",
" subscription_id=\"<SUBSCRIPTION_ID>\",\n",
" resource_group_name=\"<RESOURCE_GROUP>\",\n",
" workspace_name=\"<WORKSPACE_NAME>\",\n",
" workspace_name=\"WORKSPACE_NAME>\",\n",
" )\n",
"\n",
"# the models, fine tuning pipelines and environments are available in the AzureML system registry, \"azureml-preview\"\n",
@@ -175,10 +175,25 @@
"> The [CNN DailyMail](https://huggingface.co/datasets/cnn_dailymail) dataset is larger than 1GB when uncompressed. The [download-dataset.py](./news-summary-dataset/download-dataset.py) has supports downloading a smaller fraction of the dataset. The files in the [](./news-summary-dataset/) folder contain about 3% of the original dataset rows. \n",
"\n",
"A copy of the dataset is available in the [news-summary-dataset](./news-summary-dataset/) folder. \n",
"* Download the dataset.\n",
"* Visualize some data rows. \n",
"* We want this sample to run quickly, so save smaller `train`, `validation` and `test` files containing 20% of the already trimmed rows. This means the fine tuned model will have lower accuracy, hence it should not be put to real-world use. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# download the dataset using the helper script. This needs datasets library: https://pypi.org/project/datasets/\n",
"import os\n",
"\n",
"exit_status = os.system(\"python ./download-dataset.py\")\n",
"if exit_status != 0:\n",
" raise Exception(\"Error downloading dataset\")"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -193,7 +208,9 @@
"# load the train.jsonl, test.jsonl and validation.jsonl files from the ./news-summary-dataset/ folder and show first 5 rows\n",
"train_df = pd.read_json(\"./news-summary-dataset/train.jsonl\", lines=True)\n",
"validation_df = pd.read_json(\"./news-summary-dataset/validation.jsonl\", lines=True)\n",
"test_df = pd.read_json(\"./news-summary-dataset/test.jsonl\", lines=True)\n",
"# this dataset doesn't have test data, so split the validation_df into test_df and validation_df\n",
"test_df = validation_df.sample(frac=0.5, random_state=42)\n",
"validation_df.drop(test_df.index, inplace=True)\n",
"# drop the id column as it is not needed for fine tuning\n",
"train_df.drop(columns=[\"id\"], inplace=True)\n",
"validation_df.drop(columns=[\"id\"], inplace=True)\n",
@@ -504,10 +521,8 @@
"import json\n",
"\n",
"test_df = pd.read_json(\n",
" \"./news-summary-dataset/test.jsonl\", orient=\"records\", lines=True\n",
" \"./news-summary-dataset/small_test.jsonl\", orient=\"records\", lines=True\n",
")\n",
"# drop the id column\n",
"test_df.drop(columns=[\"id\"], inplace=True)\n",
"# take 1 random sample\n",
"test_df = test_df.sample(n=1)\n",
"# rebuild index\n",
@@ -605,7 +620,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
"version": "3.8.13"
}
},
"nbformat": 4,
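
The sample_score.json generated above is the request body for the online endpoint the notebook deploys. As a hedged sketch under stated assumptions (the endpoint and deployment names below are placeholders, not values from this commit), scoring it with the Azure ML SDK v2 could look like:

# hedged sketch: send the generated request file to the deployed online endpoint
# the workspace values and endpoint/deployment names are placeholders to fill in
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="<SUBSCRIPTION_ID>",
    resource_group_name="<RESOURCE_GROUP>",
    workspace_name="<WORKSPACE_NAME>",
)
response = ml_client.online_endpoints.invoke(
    endpoint_name="<ENDPOINT_NAME>",  # placeholder: the endpoint created by the notebook
    deployment_name="<DEPLOYMENT_NAME>",  # placeholder
    request_file="./news-summary-dataset/sample_score.json",
)
print(response)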
