
Commit 735acc8

update baseline

1 parent 391d53d

18 files changed: +141561 −1240 lines

.DS_Store

0 Bytes
Binary file not shown.

__pycache__/model.cpython-39.pyc

5.96 KB
Binary file not shown.

__pycache__/operations.cpython-39.pyc

3.42 KB
Binary file not shown.

__pycache__/run.cpython-39.pyc

2.89 KB
Binary file not shown.

__pycache__/utils.cpython-39.pyc

8.7 KB
Binary file not shown.

README.md → archive/README.md

File renamed without changes.

archive/model.py

+145
@@ -0,0 +1,145 @@
import tokenizers.decoders
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import torch
from transformers import AutoTokenizer, AutoModel, BertForTokenClassification


class SentenceGetter(object):

    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["word"].values.tolist(), s["tag"].values.tolist())]
        self.grouped = self.dataset.groupby("sentence_idx").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except KeyError:
            return None


class CustomDataset(Dataset):
    def __init__(self, tokenizer, sentences, labels, max_len):
        self.len = len(sentences)
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        sentence = str(self.sentences[index])
        label = self.labels[index]
        new_label = []
        previous_word_idx = None
        tokenized_inputs = self.tokenizer(sentence.split(' '), truncation=True, max_length=self.max_len,
                                          padding='max_length', is_split_into_words=True)
        decode_tkn = self.tokenizer.convert_ids_to_tokens(tokenized_inputs['input_ids'])
        word_ids = tokenized_inputs.word_ids()  # Map tokens to their respective word.
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                new_label.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                new_label.append(label[word_idx])
            else:  # Subsequent sub-tokens inherit the label of their word.
                new_label.append(label[previous_word_idx])
            previous_word_idx = word_idx
        ids = tokenized_inputs['input_ids']
        mask = tokenized_inputs['attention_mask']
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'tags': torch.tensor(new_label, dtype=torch.long),
            'tkn_result': decode_tkn
        }

    def __len__(self):
        return self.len


class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.layer = AutoModel.from_pretrained('emilyalsentzer/Bio_ClinicalBERT').eval()
        # self.layer = AutoModel.from_pretrained('transformersbook/bert-base-uncased-finetuned-clinc', num_labels=278)
        # self.layer = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
        self.fc = nn.Linear(768, 287)
        self.relu = nn.ReLU()
        self.dropout = torch.nn.Dropout(0.3)

    def forward(self, ids, mask, labels=None):
        with torch.no_grad():
            # The frozen encoder is run without gradients; only the head below is trained.
            output = self.layer(ids, mask)[0]
        output = self.dropout(output)
        output = self.fc(output)
        output = self.relu(output)
        return output


class BERT_LSTM_CNN(nn.Module):
    def __init__(self, hdim=768):
        super(BERT_LSTM_CNN, self).__init__()
        self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.feature_num = 287
        self.hidden_dim = hdim  # stored so init_hidden() can reference it
        self.bert = AutoModel.from_pretrained('emilyalsentzer/Bio_ClinicalBERT').eval()
        self.lstm = nn.LSTM(768, hdim, batch_first=True)
        self.conv = nn.Sequential(
            nn.Conv1d(hdim, 287, 3, padding=1),
            nn.ReLU(),
        )
        # self.maxpool = nn.MaxPool1d(kernel_size=3)

    def init_hidden(self, batch_size):
        return (torch.randn(2, batch_size, self.hidden_dim // 2).to(self.dev),
                torch.randn(2, batch_size, self.hidden_dim // 2).to(self.dev))

    def forward(self, ids, mask, labels=None):
        with torch.no_grad():
            embeds = self.bert(ids, mask)[0]  # [32, maxlen, 768]
        lstm_out, _ = self.lstm(embeds)  # [32, maxlen, 768]
        conv_out = self.conv(torch.permute(lstm_out, (0, 2, 1)))
        return torch.permute(conv_out, (0, 2, 1))
        # print(conv_out.size())
        # maxpool = self.maxpool(conv_out)
        # print(maxpool.size())


class LSTM_CLS(nn.Module):
    def __init__(self, class_num, hidden_dim=768):
        super(LSTM_CLS, self).__init__()
        self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.hidden_dim = hidden_dim
        self.target_num = class_num
        self.scibert = AutoModel.from_pretrained('emilyalsentzer/Bio_ClinicalBERT').to(self.dev)
        self.lstm = nn.LSTM(768, hidden_dim // 2, num_layers=1, bidirectional=True, batch_first=True)

        # Maps the output of the LSTM into tag space.
        self.fc = nn.Linear(hidden_dim, self.target_num)
        self.hidden = self.init_hidden(batch_size=32)

    def init_hidden(self, batch_size):
        return (torch.randn(2, batch_size, self.hidden_dim // 2).to(self.dev),
                torch.randn(2, batch_size, self.hidden_dim // 2).to(self.dev))

    def _bert_enc(self, ids, mask):
        """
        x: [batchsize, sent_len]
        enc: [batch_size, sent_len, 768]
        """
        with torch.no_grad():
            enc = self.scibert(ids, mask)[0]
        return enc

    def forward(self, ids, mask, batch_size):
        self.hidden = self.init_hidden(batch_size=batch_size)
        embeds = self._bert_enc(ids, mask)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        outputs = self.fc(lstm_out)
        return outputs
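As a reading aid for the alignment loop in CustomDataset.__getitem__ above, here is a minimal, self-contained sketch of what it produces, assuming the Bio_ClinicalBERT tokenizer can be downloaded; the three-word sentence and its tag ids are made up for illustration:

from transformers import AutoTokenizer

# Hypothetical inputs: three words with word-level tag ids 5, 5, 12.
tokenizer = AutoTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')
words = ['chest', 'pain', 'radiating']
word_labels = [5, 5, 12]

enc = tokenizer(words, is_split_into_words=True, truncation=True, max_length=16, padding='max_length')
aligned = []
for word_idx in enc.word_ids():
    if word_idx is None:
        aligned.append(-100)                   # [CLS]/[SEP]/[PAD] positions
    else:
        aligned.append(word_labels[word_idx])  # first sub-token and continuations both get the word's tag

print(tokenizer.convert_ids_to_tokens(enc['input_ids']))
print(aligned)

Because nn.CrossEntropyLoss ignores index -100 by default, the special and padding positions marked above drop out of the training loss without any extra masking.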
File renamed without changes.

archive/run.py

+207
@@ -0,0 +1,207 @@
import argparse
import os
import utils
from model import *
import torch.nn as nn
import torch.nn.functional
from transformers import AutoTokenizer
# from seqeval.metrics import f1_score
import numpy as np
from sys import stdout
from sklearn.metrics import classification_report


def cal_accuracy(preds, label_ids, mask):
    valid_len = np.sum(mask)
    flat_preds = preds.to('cpu').numpy().flatten()[:valid_len]
    flat_labels = label_ids.flatten()[:valid_len]
    acc = classification_report(flat_labels, flat_preds, output_dict=True)['accuracy']
    new_labels = [i for i in flat_labels if i != 286]  # exclude label 286 (the last class, used as the background tag) from the NER score
    new_labels = list(dict.fromkeys(new_labels))
    target_names = [str(i) for i in new_labels]
    # classification_report has no top-level 'f1-score' key; aggregate over the entity labels instead.
    ner_f1 = classification_report(flat_labels, flat_preds, labels=new_labels, target_names=target_names,
                                   output_dict=True)['macro avg']['f1-score']
    return acc, ner_f1


def validation(model, testing_loader, model_name='LSTM_CLS', batch_size=None):
    model.eval()
    eval_loss = 0
    eval_accuracy = 0
    eval_ner_acc = 0
    n_correct = 0
    n_wrong = 0
    total = 0
    predictions, true_labels = [], []
    nb_eval_steps, nb_eval_examples = 0, 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(dev, dtype=torch.long)
            mask = data['mask'].to(dev, dtype=torch.long)
            targets = data['tags'].to(dev, dtype=torch.long)
            if model_name == 'LSTM_CLS':
                output = model(ids, mask, batch_size)
            else:
                output = model(ids, mask)
            loss = criterion(torch.transpose(output, 1, 2), targets)
            preds = nn.functional.softmax(output, dim=2)
            preds = torch.argmax(preds, dim=2)
            label_ids = targets.to('cpu').numpy()
            true_labels.append(label_ids)
            accuracy, ner_accuracy = cal_accuracy(preds, label_ids, mask.to('cpu').numpy())
            eval_loss += loss.mean().item()
            eval_accuracy += accuracy
            eval_ner_acc += ner_accuracy
            nb_eval_examples += ids.size(0)
            nb_eval_steps += 1
    eval_loss = eval_loss / nb_eval_steps
    stdout.write("Validation loss: {}\n".format(eval_loss))
    stdout.write("Validation Accuracy: {}\n".format(eval_accuracy / nb_eval_steps))
    stdout.write("Validation NER f1-score: {}\n".format(eval_ner_acc / nb_eval_steps))
    stdout.flush()
    # pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
    # valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    # print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))


def train(epoch_num, batch_size, model_name='LSTM_CLS'):
    best_avg_loss = 10
    best_epoch = 0
    for epoch in range(epoch_num):
        model.train()
        cumulative_loss = []
        curr_avg_loss = 0
        for i, data in enumerate(training_loader, 0):
            iter_total = len(training_loader)
            ids = data['ids'].to(dev, dtype=torch.long)
            mask = data['mask'].to(dev, dtype=torch.long)
            targets = data['tags'].to(dev, dtype=torch.long)  # [32, 200]
            model.zero_grad()
            if model_name == 'LSTM_CLS':
                output = model(ids, mask, batch_size)
            else:
                output = model(ids, mask)
            loss = criterion(torch.transpose(output, 1, 2), targets)
            curr_loss = loss.item()
            cumulative_loss.append(curr_loss)
            curr_avg_loss = sum(cumulative_loss) / len(cumulative_loss)
            if i == 0:
                stdout.write(f'======== {model_name}: Starting epoch {epoch} ========\n')
                stdout.write(f'[{i + 1}/{iter_total}] - initial loss: {loss.item()}\n')
            elif (i + 1) % batch_size == 0:
                # stdout.write(f'[{i + 1}/{iter_total}] - loss: {loss.item()} ({curr_avg_loss})\n')
                stdout.write(f'[{i + 1}/{iter_total}] - loss: {curr_avg_loss}\n')
            stdout.flush()
            loss.backward()
            optimizer.step()
        scheduler.step()  # one step per epoch, matching CosineAnnealingLR(optimizer, EPOCHS)
        if curr_avg_loss < best_avg_loss:
            best_avg_loss = curr_avg_loss
            best_epoch = epoch
            torch.save(model, os.path.join(root, "checkpoint/best_model.pt"))
        stdout.write(f'Epoch {epoch} finished - avg. loss: {curr_avg_loss}, best epoch: {best_epoch}, best loss: {best_avg_loss}\n')
        stdout.flush()
        validation(model, testing_loader, model_name, int(batch_size / 2))
        # xm.optimizer_step(optimizer)
        # xm.mark_step()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode')
    parser.add_argument('--ckpt', default=None)
    parser.add_argument('--model', default='LSTM_CLS')
    parser.add_argument('--epoch', default=30)
    parser.add_argument('--batch_size', default=1)
    parser.add_argument('--max_len', default=250)
    parser.add_argument('--lr', default=0.001)

    args = parser.parse_args()

    root = ''
    data_root = 'data'
    data_path = os.path.join(data_root, 'train.csv')
    pn_path = os.path.join(data_root, 'patient_notes.csv')
    feature_path = os.path.join(data_root, 'features.csv')
    preprocessor = utils.Preprocessor(data_path, pn_path, feature_path)
    dataset = preprocessor.to_dataframe()
    getter = SentenceGetter(dataset)
    dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # dev = xm.xla_device()

    # ========= Tag to idx ========== #
    tag2idx = preprocessor.make_vocab()
    sentences = [' '.join([s[0] for s in sent]) for sent in getter.sentences]
    # sentences = [s[0] for sent in getter.sentences for s in sent]
    # print(sentences)
    labels = [[s[1] for s in sent] for sent in getter.sentences]
    labels = [[tag2idx.get(l) for l in lab] for lab in labels]

    # ========= Training variables ========== #
    MAX_LEN = 250
    TRAIN_BATCH_SIZE = 32
    VALID_BATCH_SIZE = 16
    EPOCHS = 30
    LEARNING_RATE = 0.001
    # tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    # tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
    # tokenizer = AutoTokenizer.from_pretrained('transformersbook/bert-base-uncased-finetuned-clinc')
    tokenizer = AutoTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')

    # ========= Creating the dataset and dataloader for the neural network ========== #
    train_percent = 0.8
    train_size = int(train_percent * len(sentences))
    # train_dataset=df.sample(frac=train_size,random_state=200).reset_index(drop=True)
    # test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
    train_sentences = sentences[0:train_size]
    # print(train_sentences)
    train_labels = labels[0:train_size]

    test_sentences = sentences[train_size:]
    test_labels = labels[train_size:]

    print("FULL Dataset: {}".format(len(sentences)))
    print("TRAIN Dataset: {}".format(len(train_sentences)))
    print("TEST Dataset: {}".format(len(test_sentences)))

    training_set = CustomDataset(tokenizer, train_sentences, train_labels, MAX_LEN)
    testing_set = CustomDataset(tokenizer, test_sentences, test_labels, MAX_LEN)

    # ========= Parameters ========== #
    train_params = {'batch_size': TRAIN_BATCH_SIZE,
                    'shuffle': True,
                    'num_workers': 0
                    }

    test_params = {'batch_size': VALID_BATCH_SIZE,
                   'shuffle': True,
                   'num_workers': 0
                   }

    training_loader = DataLoader(training_set, **train_params)
    testing_loader = DataLoader(testing_set, **test_params)  # evaluate on the held-out split, not the training set

    # ========= Char embedding ========== #
    embeds = utils.char_embedding(training_loader)

    # optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()

    if args.mode == 'train':
        if args.model == 'BERT':
            model = BERT()
        elif args.model == 'BERT_LSTM_CNN':
            model = BERT_LSTM_CNN()
        elif args.model == 'LSTM_CLS':
            model = LSTM_CLS(287)
        else:
            model = None
        model.to(dev)
        # optimizer = torch.optim.SGD(params=model.parameters(), lr=LEARNING_RATE, momentum=0.9, weight_decay=0.9)
        optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, EPOCHS)
        train(EPOCHS, TRAIN_BATCH_SIZE, args.model)
    elif args.mode == 'test':
        model_path = args.ckpt
        # model = torch.load(model_path, map_location=torch.device('cpu'))
        model = torch.load(model_path)
        validation(model, testing_loader, args.model, VALID_BATCH_SIZE)  # pass the chosen model type and eval batch size
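A small sanity-check sketch (shapes are made up) of the loss call used in train() and validation() above: nn.CrossEntropyLoss expects the class dimension in position 1 for sequence targets, which is why the [batch, seq_len, num_classes] model output is transposed first.

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()           # default ignore_index=-100 skips the masked label positions
output = torch.randn(4, 250, 287)           # [batch, seq_len, num_classes], as the models above return
targets = torch.randint(0, 287, (4, 250))   # [batch, seq_len] tag ids
targets[:, 200:] = -100                     # e.g. padded positions, excluded from the loss

loss = criterion(torch.transpose(output, 1, 2), targets)  # scores must be [batch, num_classes, seq_len]
print(loss.item())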

archive/subplayground.py

+9
@@ -0,0 +1,9 @@
import torch
import torch as F
import numpy as np
from sklearn.metrics import classification_report

x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
print(x[0:2])
print(x[2:4])
print(x[4:10])
