forked from Azure/azureml-examples
Commit
remove news summary dataset from repo, download using script in sample (Azure#2250)

* remove news summary dataset from repo, download using script in sample
* fix to cli sample to remove dataset from repo
* typo fix
* formatting
1 parent 4c66992 · commit 047e058

Showing 6 changed files with 134 additions and 19 deletions.
cli/foundation-models/system/finetune/summarization/download-dataset.py
87 changes: 87 additions & 0 deletions

@@ -0,0 +1,87 @@
# import libraries to parse command line arguments and handle files
import argparse
import json
import os

import pandas as pd

parser = argparse.ArgumentParser()
# add an argument to specify a dataset name to download
parser.add_argument("--dataset", type=str, default="cnn_dailymail", help="dataset name")
# add an argument to specify the config name of the dataset
parser.add_argument(
    "--config_name", type=str, default="3.0.0", help="config name of the dataset"
)
# add an argument to specify the fraction of the dataset to save
parser.add_argument(
    "--fraction", type=float, default=0.05, help="fraction of the dataset to save"
)
# add an argument to specify the directory to download the dataset to
parser.add_argument(
    "--download_dir",
    type=str,
    default="./news-summary-dataset",
    help="directory to download the dataset to",
)
args = parser.parse_args()

# create the download directory if it does not exist
if not os.path.exists(args.download_dir):
    os.makedirs(args.download_dir)

# import hugging face datasets library
from datasets import load_dataset, get_dataset_split_names

for split in get_dataset_split_names(args.dataset, config_name=args.config_name):
    print(f"Loading {split} split of {args.dataset} dataset...")
    # load the split of the dataset
    dataset = load_dataset(args.dataset, args.config_name, split=split)
    # save a fraction of the split to the download directory as a json lines file
    dataset.select(range(int(dataset.num_rows * args.fraction))).to_json(
        os.path.join(args.download_dir, f"{split}.jsonl")
    )

train_df = pd.read_json(os.path.join(args.download_dir, "train.jsonl"), lines=True)
validation_df = pd.read_json(
    os.path.join(args.download_dir, "validation.jsonl"), lines=True
)
# this dataset doesn't have test data, so split validation_df into test_df and validation_df
test_df = validation_df.sample(frac=0.5, random_state=42)
validation_df.drop(test_df.index, inplace=True)
# drop the id column as it is not needed for fine tuning
train_df.drop(columns=["id"], inplace=True)
validation_df.drop(columns=["id"], inplace=True)
test_df.drop(columns=["id"], inplace=True)

# save 20% of the rows from the dataframes into files with a small_ prefix in the download directory
train_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_train.jsonl"), orient="records", lines=True
)
validation_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_validation.jsonl"),
    orient="records",
    lines=True,
)
test_df.sample(frac=0.2).to_json(
    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
)

# generate sample scoring data: read small_test.jsonl back into a pandas dataframe
test_df = pd.read_json(
    os.path.join(args.download_dir, "small_test.jsonl"), orient="records", lines=True
)
# take 1 random sample
test_df = test_df.sample(n=1)
# rebuild the index
test_df.reset_index(drop=True, inplace=True)
# rename the highlights column to ground_truth_summary
test_df.rename(columns={"highlights": "ground_truth_summary"}, inplace=True)
# create a json object with the key "inputs" and a list of articles as the value
test_json = {"inputs": {"input_string": test_df["article"].tolist()}}
# save the json object to a file named sample_score.json in the download directory
with open(os.path.join(args.download_dir, "sample_score.json"), "w") as f:
    json.dump(test_json, f)
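
As a quick sanity check after running the script with its defaults (e.g. `python download-dataset.py`), the expected outputs in `./news-summary-dataset` are the per-split jsonl files, their small_ variants, and sample_score.json. The minimal sketch below (not part of the commit; file names and shapes come from the script above) verifies the generated artifacts:

import json
import os

import pandas as pd

download_dir = "./news-summary-dataset"  # default --download_dir of the script above

# sample_score.json holds a single-record payload: {"inputs": {"input_string": [...]}}
with open(os.path.join(download_dir, "sample_score.json")) as f:
    payload = json.load(f)
assert isinstance(payload["inputs"]["input_string"], list)  # one article string

# the small_ files keep the article/highlights columns; the id column was dropped
small_train = pd.read_json(os.path.join(download_dir, "small_train.jsonl"), lines=True)
print(small_train.columns.tolist())  # expected: ['article', 'highlights']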
cli/foundation-models/system/finetune/summarization/summarization-config.json
5 changes: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
{
    "metrics": ["rouge1", "rouge2", "rougeL", "rougeLsum"],
    "aggregator": true,
    "stemmer": true
}
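
These fields line up with the knobs of the Hugging Face `evaluate` ROUGE metric. A minimal sketch of that mapping, assuming the config is consumed as `rouge_types`/`use_aggregator`/`use_stemmer` (the wiring is an assumption; the JSON itself does not spell it out):

import json

import evaluate  # Hugging Face evaluate library

with open("summarization-config.json") as f:
    cfg = json.load(f)

rouge = evaluate.load("rouge")
# assumed mapping: "metrics" -> rouge_types, "aggregator" -> use_aggregator, "stemmer" -> use_stemmer
scores = rouge.compute(
    predictions=["officials said the storm caused minor damage"],
    references=["officials reported minor storm damage"],
    rouge_types=cfg["metrics"],
    use_aggregator=cfg["aggregator"],
    use_stemmer=cfg["stemmer"],
)
print(scores)  # one score per requested rouge type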
...on/foundation-models/system/finetune/summarization/news-summary-dataset/sample_score.json
1 change: 0 additions & 1 deletion

This file was deleted.