
Commit c60a58d

init commit
1 parent e3e341f commit c60a58d

File tree

9 files changed: 77852 additions, 0 deletions


README.md

Lines changed: 20 additions & 0 deletions
# LS-LLaMA

LABEL SUPERVISED LLAMA FINETUNING

## Usage

Load Pretrained Models

```python
from transformers import AutoTokenizer
from modeling_llama import (
    LlamaForSequenceClassification, LlamaForTokenClassification,
    UnmaskingLlamaForSequenceClassification, UnmaskingLlamaForTokenClassification,
)

model_id = 'meta-llama/Llama-2-7b'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = LlamaForSequenceClassification.from_pretrained(model_id)
```

For more usage examples, please refer to `unllama_seq_clf.py`, `unllama_token_clf.py`, `llama_seq_clf.py`, and `llama_token_clf.py`.
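
The unmasking variants expose the same interface. Below is a minimal sketch of loading one for token classification; the label set is hypothetical, and the classification-head keyword arguments are assumed to match those used in `llama_token_clf.py`:

```python
from transformers import AutoTokenizer
from modeling_llama import UnmaskingLlamaForTokenClassification

model_id = 'meta-llama/Llama-2-7b'
label2id = {'O': 0, 'B-PER': 1, 'I-PER': 2}   # hypothetical tag set for illustration
id2label = {v: k for k, v in label2id.items()}

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = UnmaskingLlamaForTokenClassification.from_pretrained(
    model_id, num_labels=len(label2id), id2label=id2label, label2id=label2id
)
```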

data/ontonotesv5/dev.jsonl

Lines changed: 8528 additions & 0 deletions
Large diffs are not rendered by default.

data/ontonotesv5/test.jsonl

Lines changed: 8262 additions & 0 deletions
Large diffs are not rendered by default.

data/ontonotesv5/train.jsonl

Lines changed: 59924 additions & 0 deletions
Large diffs are not rendered by default.
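
These three files are read by `load_ontonotesv5()` in `llama_token_clf.py`, which parses one JSON object per line into a `datasets.Dataset`. Judging from how the training script indexes the examples, each line is assumed to carry at least `tokens` and `ner_tags` fields; a hypothetical record might look like this:

```python
import json

# Illustrative line only; the real records live in data/ontonotesv5/*.jsonl.
line = '{"tokens": ["Prices", "rose", "in", "1997"], "ner_tags": [0, 0, 0, 6]}'
record = json.loads(line)
assert set(record) >= {"tokens", "ner_tags"}   # fields consumed by tokenize_and_align_labels
```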

llama_seq_clf.py

Lines changed: 147 additions & 0 deletions
```python
# -*- coding: utf-8 -*-

import sys
from typing import List, Any, Dict

import evaluate
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from peft import get_peft_model, LoraConfig, TaskType

from modeling_llama import LlamaForSequenceClassification


if len(sys.argv) != 3:
    print(f'usage: python {sys.argv[0]} dataset model_size')
    sys.exit()


dataset, model_size = sys.argv[1], sys.argv[2]
epochs = 10
batch_size = 8
learning_rate = 5e-5
lora_r = 12
max_length = 64
if model_size.lower() == '7b':
    model_id = 'NousResearch/Llama-2-7b-hf'
elif model_size.lower() == '13b':
    model_id = 'NousResearch/Llama-2-13b-hf'
else:
    raise NotImplementedError

# Map each dataset name to its label space, Hugging Face dataset, and text field(s).
test_name = 'test'
text_name = None
if dataset == 'agnews':
    id2label = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
    label2id = {v: k for k, v in id2label.items()}
    ds = load_dataset("ag_news")
    text_name = 'text'
elif dataset == 'twitterfin':
    id2label = {0: "Bearish", 1: "Bullish", 2: "Neutral"}
    label2id = {v: k for k, v in id2label.items()}
    ds = load_dataset("zeroshot/twitter-financial-news-sentiment")
    test_name = 'validation'
    text_name = 'text'
elif dataset == 'sst2':
    id2label = {0: "negative", 1: "positive"}
    label2id = {v: k for k, v in id2label.items()}
    ds = load_dataset("sst2")
    test_name = 'validation'
    text_name = 'sentence'
elif dataset in ['amazon_de', 'amazon_en', 'amazon_es', 'amazon_fr', 'amazon_ja', 'amazon_zh']:
    max_length = 200
    batch_size = 4
    lang = dataset.split('_')[1]
    id2label = {0: 'furniture', 1: 'baby_product', 2: 'jewelry', 3: 'musical_instruments', 4: 'industrial_supplies', 5: 'pc', 6: 'other', 7: 'pet_products', 8: 'book', 9: 'apparel', 10: 'automotive', 11: 'digital_video_download', 12: 'beauty', 13: 'toy', 14: 'shoes', 15: 'personal_care_appliances', 16: 'camera', 17: 'digital_ebook_purchase', 18: 'watch', 19: 'drugstore', 20: 'grocery', 21: 'kitchen', 22: 'home', 23: 'office_product', 24: 'home_improvement', 25: 'electronics', 26: 'video_games', 27: 'sports', 28: 'luggage', 29: 'lawn_and_garden', 30: 'wireless'}
    label2id = {v: k for k, v in id2label.items()}
    ds = load_dataset("amazon_reviews_multi", lang)
    ds = ds.rename_column('product_category', 'label')
    text_name = ['review_title', 'review_body']

    # Collator that maps the Amazon string labels to integer ids before padding.
    class DataCollatorWithPaddingAmazon(DataCollatorWithPadding):
        def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
            new_features = []
            for v in features:
                label = v.pop('label')
                v['label'] = label2id[label]
                new_features.append(v)
            features = new_features
            batch = self.tokenizer.pad(
                features,
                padding=self.padding,
                max_length=self.max_length,
                pad_to_multiple_of=self.pad_to_multiple_of,
                return_tensors=self.return_tensors,
            )
            if "label" in batch:
                batch["labels"] = batch["label"]
                del batch["label"]
            if "label_ids" in batch:
                batch["labels"] = batch["label_ids"]
                del batch["label_ids"]
            return batch

    DataCollatorWithPadding = DataCollatorWithPaddingAmazon
else:
    raise NotImplementedError

accuracy = evaluate.load("accuracy")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = LlamaForSequenceClassification.from_pretrained(
    model_id, num_labels=len(label2id), id2label=id2label, label2id=label2id
).bfloat16()
peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, inference_mode=False, r=lora_r, lora_alpha=32, lora_dropout=0.1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


def preprocess_function(examples):
    global text_name
    if isinstance(text_name, str):
        d = examples[text_name]
    else:
        # Concatenate multiple text fields (e.g. review title and body) into one input.
        d = examples[text_name[0]]
        for n in text_name[1:]:
            nd = examples[n]
            assert len(d) == len(nd)
            for i, t in enumerate(nd):
                d[i] += '\n' + t

    return tokenizer(d, padding='longest', max_length=max_length, truncation=True)


tokenized_ds = ds.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


training_args = TrainingArguments(
    output_dir="clf",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="no",
    load_best_model_at_end=False,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds[test_name],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
```
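
The script is invoked with a dataset name and model size, e.g. `python llama_seq_clf.py sst2 7b`. Once training finishes, the fine-tuned classifier can be queried like any Hugging Face sequence-classification model. Below is a minimal inference sketch that reuses the `tokenizer`, `model`, and SST-2 `id2label` mapping defined above, assuming the PEFT-wrapped `LlamaForSequenceClassification` returns standard `logits`:

```python
import torch

# Hypothetical input text; id2label here comes from the sst2 branch of the script.
text = "a gorgeous, witty, seductive movie."

inputs = tokenizer(text, return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}  # match the model's device
with torch.no_grad():
    logits = model(**inputs).logits        # shape: (1, num_labels)
pred_id = logits.argmax(dim=-1).item()
print(id2label[pred_id])                   # e.g. "positive"
```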

llama_token_clf.py

Lines changed: 140 additions & 0 deletions
```python
# -*- coding: utf-8 -*-

import json
import sys

import numpy as np
import evaluate
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType

from modeling_llama import LlamaForTokenClassification


def load_ontonotesv5():
    """Load the OntoNotes v5 splits committed under data/ontonotesv5/ (one JSON object per line)."""
    ret = {}
    for split_name in ['train', 'dev', 'test']:
        data = []
        with open(f'./data/ontonotesv5/{split_name}.jsonl', 'r') as reader:
            for line in reader:
                data.append(json.loads(line))
        ret[split_name] = Dataset.from_list(data)
    return DatasetDict(ret)


if len(sys.argv) != 3:
    print(f'usage: python {sys.argv[0]} task model_size')
    sys.exit()

task, model_size = sys.argv[1], sys.argv[2].lower()
print(f'handling task {task}')

epochs = 10
batch_size = 8
learning_rate = 1e-4
max_length = 64
if model_size == '7b':
    model_id = 'NousResearch/Llama-2-7b-hf'
    lora_r = 12
elif model_size == '13b':
    model_id = 'NousResearch/Llama-2-13b-hf'
    lora_r = 12
else:
    raise NotImplementedError
tokenizer = AutoTokenizer.from_pretrained(model_id)
seqeval = evaluate.load("seqeval")
if task == 'wnut_17':
    ds = load_dataset("wnut_17")
    label2id = {"O": 0, "B-corporation": 1, "I-corporation": 2, "B-creative-work": 3, "I-creative-work": 4, "B-group": 5, "I-group": 6, "B-location": 7, "I-location": 8, "B-person": 9, "I-person": 10, "B-product": 11, "I-product": 12}
elif task == 'conll2003':
    ds = load_dataset("conll2003")
    label2id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
elif task == 'ontonotesv5':
    ds = load_ontonotesv5()
    label2id = {'O': 0, 'B-NORP': 1, 'B-PERSON': 2, 'B-WORK_OF_ART': 3, 'B-QUANTITY': 4, 'B-EVENT': 5, 'B-DATE': 6, 'B-TIME': 7, 'B-PERCENT': 8, 'B-LANGUAGE': 9, 'B-ORG': 10, 'B-CARDINAL': 11, 'B-LAW': 12, 'B-GPE': 13, 'B-PRODUCT': 14, 'B-LOC': 15, 'B-MONEY': 16, 'B-ORDINAL': 17, 'B-FAC': 18}
else:
    raise NotImplementedError
id2label = {v: k for k, v in label2id.items()}
label_list = list(label2id.keys())  # ds["train"].features["ner_tags"].feature.names
model = LlamaForTokenClassification.from_pretrained(
    model_id, num_labels=len(label2id), id2label=id2label, label2id=label2id
).bfloat16()
peft_config = LoraConfig(task_type=TaskType.TOKEN_CLS, inference_mode=False, r=lora_r, lora_alpha=32, lora_dropout=0.1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], is_split_into_words=True, padding='longest', max_length=max_length, truncation=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Drop ignored positions (-100) before scoring with seqeval.
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


training_args = TrainingArguments(
    output_dir="my_awesome_ds_model",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
```
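
The core step above is `tokenize_and_align_labels`: only the first sub-token of each word keeps its NER tag, while special tokens and continuation sub-tokens get `-100` so the loss and metrics ignore them. A standalone sketch of that alignment on a toy CoNLL-2003-style sentence, assuming the fast LLaMA tokenizer (any tokenizer exposing `word_ids()` behaves the same):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('NousResearch/Llama-2-7b-hf')

# Toy word-level example; tag ids follow the conll2003 label2id above.
words = ["EU", "rejects", "German", "call"]
word_tags = [3, 0, 7, 0]  # B-ORG, O, B-MISC, O

enc = tokenizer(words, is_split_into_words=True)
aligned, previous = [], None
for word_idx in enc.word_ids():
    if word_idx is None:                # special tokens such as <s>
        aligned.append(-100)
    elif word_idx != previous:          # first sub-token of a word keeps the tag
        aligned.append(word_tags[word_idx])
    else:                               # continuation sub-tokens are ignored
        aligned.append(-100)
    previous = word_idx

print(list(zip(enc.tokens(), aligned)))
```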

0 commit comments