franksyng
diff --git a/‎.DS_Store
0 Bytes b/‎.DS_Store
0 Bytes
diff --git a/‎__pycache__/model.cpython-39.pyc
5.96 KB b/‎__pycache__/model.cpython-39.pyc
5.96 KB
diff --git a/‎__pycache__/operations.cpython-39.pyc
3.42 KB b/‎__pycache__/operations.cpython-39.pyc
3.42 KB
diff --git a/‎__pycache__/run.cpython-39.pyc
2.89 KB b/‎__pycache__/run.cpython-39.pyc
2.89 KB
diff --git a/‎__pycache__/utils.cpython-39.pyc
8.7 KB b/‎__pycache__/utils.cpython-39.pyc
8.7 KB
diff --git a/‎README.md ‎archive/README.md b/‎README.md ‎archive/README.md
diff --git a/‎archive/model.py
+145 b/‎archive/model.py
+145
diff --git a/‎playground.py ‎archive/playground.py b/‎playground.py ‎archive/playground.py
diff --git a/‎archive/run.py
+207 b/‎archive/run.py
+207
diff --git a/‎archive/subplayground.py
+9 b/‎archive/subplayground.py
+9
@@ -0,0 +1,145 @@
+import tokenizers.decoders
+from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
+import torch.nn as nn
+import torch
+from transformers import AutoTokenizer, AutoModel, BertForTokenClassification
+
+
+class SentenceGetter(object):
+
+    def __init__(self, dataset):
+        self.n_sent = 1
+        self.dataset = dataset
+        self.empty = False
+        agg_func = lambda s: [(w, t) for w, t in zip(s["word"].values.tolist(), s["tag"].values.tolist())]
+        self.grouped = self.dataset.groupby("sentence_idx").apply(agg_func)
+        self.sentences = [s for s in self.grouped]
+
+    def get_next(self):
+        try:
+            s = self.grouped["Sentence: {}".format(self.n_sent)]
+            self.n_sent += 1
+            return s
+        except:
+            return None
+
+
+class CustomDataset(Dataset):
+    def __init__(self, tokenizer, sentences, labels, max_len):
+        self.len = len(sentences)
+        self.sentences = sentences
+        self.labels = labels
+        self.tokenizer = tokenizer
+        self.max_len = max_len
+
+    def __getitem__(self, index):
+        sentence = str(self.sentences[index])
+        label = self.labels[index]
+        new_label = []
+        previous_word_idx = None
+        tokenized_inputs = self.tokenizer(sentence.split(' '), truncation=True, max_length=self.max_len,
+                                          padding='max_length', is_split_into_words=True)
+        decode_tkn = self.tokenizer.convert_ids_to_tokens(tokenized_inputs['input_ids'])
+        word_ids = tokenized_inputs.word_ids()  # Map tokens to their respective word.
+        for word_idx in word_ids:  # Set the special tokens to -100.
+            if word_idx is None:
+                new_label.append(-100)
+            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
+                new_label.append(label[word_idx])
+            else:
+                new_label.append(label[previous_word_idx])
+            previous_word_idx = word_idx
+        ids = tokenized_inputs['input_ids']
+        mask = tokenized_inputs['attention_mask']
+        return {
+            'ids': torch.tensor(ids, dtype=torch.long),
+            'mask': torch.tensor(mask, dtype=torch.long),
+            'tags': torch.tensor(new_label, dtype=torch.long),
+            'tkn_result': decode_tkn
+        }
+
+    def __len__(self):
+        return self.len
+
+
+class BERT(nn.Module):
+    def __init__(self):
+        super(BERT, self).__init__()
+        self.layer = AutoModel.from_pretrained('emilyalsentzer/Bio_ClinicalBERT').eval()
+        # self.layer = AutoModel.from_pretrained('transformersbook/bert-base-uncased-finetuned-clinc', num_labels=278)
+        # self.layer = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
+        self.fc = nn.Linear(768, 287)
+        self.relu = nn.ReLU()
+        self.dropout = torch.nn.Dropout(0.3)
+
+    def forward(self, ids, mask, labels=None):
+        with torch.no_grad():
+            output = self.layer(ids, mask)[0]
+        output = self.dropout(output)
+        output = self.fc(output)
+        output = self.relu(output)
+        return output
+
+
+class BERT_LSTM_CNN(nn.Module):
+    def __init__(self, hdim=768):
+        super(BERT_LSTM_CNN, self).__init__()
+        self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.feature_num = 287
+        self.bert = AutoModel.from_pretrained('emilyalsentzer/Bio_ClinicalBERT').eval()
+        self.lstm = nn.LSTM(768, hdim, batch_first=True)
+        self.conv = nn.Sequential(
+            nn.Conv1d(hdim, 287, 3, padding=1),
+            nn.ReLU(),
+        )
+        # self.maxpool = nn.MaxPool1d(kernel_size=3)
+
+    def init_hidden(self, batch_size):
+        return (torch.randn(2, batch_size, self.hidden_dim // 2).to(self.dev),
+              torch.randn(2, batch_size, self.hidden_dim // 2).to(self.dev))
+
+    def forward(self, ids, mask, labels=None):
+        with torch.no_grad():
+            embeds = self.bert(ids, mask)[0]  # [32, maxlen, 768]
+        lstm_out, _ = self.lstm(embeds)  # [32, maxlen, 768]
+        conv_out = self.conv(torch.permute(lstm_out, (0, 2, 1)))
+        return torch.permute(conv_out, (0, 2, 1))
+        # print(conv_out.size())
+        # maxpool = self.maxpool(conv_out)
+        # print(maxpool.size())
+
+
+class LSTM_CLS(nn.Module):
+    def __init__(self, class_num, hidden_dim=768):
+        super(LSTM_CLS, self).__init__()
+        self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.hidden_dim = hidden_dim
+        self.target_num = class_num
+        self.scibert = AutoModel.from_pretrained('emilyalsentzer/Bio_ClinicalBERT').to(self.dev)
+        self.lstm = nn.LSTM(768, hidden_dim // 2, num_layers=1, bidirectional=True, batch_first=True)
+
+        # Maps the output of the LSTM into tag space.
+        self.fc = nn.Linear(hidden_dim, self.target_num)
+        self.hidden = self.init_hidden(batch_size=32)
+
+    def init_hidden(self, batch_size):
+        return (torch.randn(2, batch_size, self.hidden_dim // 2).to(self.dev),
+              torch.randn(2, batch_size, self.hidden_dim // 2).to(self.dev))
+
+    def _bert_enc(self, ids, mask):
+        """
+        x: [batchsize, sent_len]
+        enc: [batch_size, sent_len, 768]
+        """
+        with torch.no_grad():
+            enc = self.scibert(ids, mask)[0]
+        return enc
+
+    def forward(self, ids, mask, batch_size):
+        self.hidden = self.init_hidden(batch_size=batch_size)
+        embeds = self._bert_enc(ids, mask)
+        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
+        outputs = self.fc(lstm_out)
+        return outputs
+
+
@@ -0,0 +1,207 @@
+import argparse
+import os
+import utils
+from model import *
+import torch.nn as nn
+import torch.nn.functional
+from transformers import AutoTokenizer
+# from seqeval.metrics import f1_score
+import numpy as np
+from sys import stdout
+from sklearn.metrics import classification_report
+
+
+def cal_accuracy(preds, label_ids, mask):
+    valid_len = np.sum(mask)
+    flat_preds = preds.to('cpu').numpy().flatten()[:valid_len]
+    flat_labels = label_ids.flatten()[:valid_len]
+    acc = classification_report(flat_labels, flat_preds, output_dict=True)['accuracy']
+    new_labels = [i for i in flat_labels if i != 286]
+    new_labels = list(dict.fromkeys(new_labels))
+    target_names = [str(i) for i in new_labels]
+    ner_f1 = classification_report(flat_labels, flat_preds, labels=new_labels, target_names=target_names, output_dict=True)['f1-score']
+    return acc, ner_f1
+
+
+def validation(model, testing_loader, model_name='LSTM_CLS', batch_size=None):
+    model.eval()
+    eval_loss = 0
+    eval_accuracy = 0
+    eval_ner_acc = 0
+    n_correct = 0
+    n_wrong = 0
+    total = 0
+    predictions, true_labels = [], []
+    nb_eval_steps, nb_eval_examples = 0, 0
+    with torch.no_grad():
+        for _, data in enumerate(testing_loader, 0):
+            ids = data['ids'].to(dev, dtype=torch.long)
+            mask = data['mask'].to(dev, dtype=torch.long)
+            targets = data['tags'].to(dev, dtype=torch.long)
+            if model_name == 'LSTM_CLS':
+                output = model(ids, mask, batch_size)
+            else:
+                output = model(ids, mask)
+            loss = criterion(torch.transpose(output, 1, 2), targets)
+            preds = nn.functional.softmax(output, dim=2)
+            preds = torch.argmax(preds, dim=2)
+            label_ids = targets.to('cpu').numpy()
+            true_labels.append(label_ids)
+            accuracy, ner_accuracy = cal_accuracy(preds, label_ids, mask.to('cpu').numpy())
+            eval_loss += loss.mean().item()
+            eval_accuracy += accuracy
+            eval_ner_acc += ner_accuracy
+            nb_eval_examples += ids.size(0)
+            nb_eval_steps += 1
+        eval_loss = eval_loss/nb_eval_steps
+        stdout.write("Validation loss: {}\n".format(eval_loss))
+        stdout.write("Validation Accuracy: {}\n".format(eval_accuracy/nb_eval_steps))
+        stdout.write("Validation NER f1-score: {}\n".format(eval_ner_acc / nb_eval_steps))
+        stdout.flush()
+        # pred_tags = [tags_vals[p_i] for p in predictions for p_i in p]
+        # valid_tags = [tags_vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
+        # print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))
+
+
+def train(epoch_num, batch_size, model_name='LSTM_CLS'):
+    best_avg_loss = 10
+    best_epoch = 0
+    for epoch in range(epoch_num):
+        model.train()
+        cumulative_loss = []
+        curr_avg_loss = 0
+        for i, data in enumerate(training_loader, 0):
+            iter_total = len(training_loader)
+            ids = data['ids'].to(dev, dtype=torch.long)
+            mask = data['mask'].to(dev, dtype=torch.long)
+            targets = data['tags'].to(dev, dtype=torch.long)  # [32, 200]
+            model.zero_grad()
+            if model_name == 'LSTM_CLS':
+                output = model(ids, mask, batch_size)
+            else:
+                output = model(ids, mask)
+            loss = criterion(torch.transpose(output, 1, 2), targets)
+            curr_loss = loss.item()
+            cumulative_loss.append(curr_loss)
+            curr_avg_loss = sum(cumulative_loss) / len(cumulative_loss)
+            if i == 0:
+                stdout.write(f'======== {model_name}: Starting epoch {epoch} ========\n')
+                stdout.write(f'[{i + 1}/{iter_total}] - initial loss:  {loss.item()}\n')
+            elif (i + 1) % batch_size == 0:
+                # stdout.write(f'[{i + 1}/{iter_total}] - loss:  {loss.item()} ({curr_avg_loss})\n')
+                stdout.write(f'[{i + 1}/{iter_total}] - loss: {curr_avg_loss}\n')
+            stdout.flush()
+            loss.backward()
+            optimizer.step()
+        scheduler.step()
+        if curr_avg_loss < best_avg_loss:
+            best_avg_loss = curr_avg_loss
+            best_epoch = epoch
+            torch.save(model, os.path.join(root, "checkpoint/best_model.pt"))
+        stdout.write(f'Epoch {epoch} finished - avg. loss: {curr_avg_loss}, best epoch: {best_epoch}, best loss: {best_avg_loss}\n')
+        stdout.flush()
+        validation(model, testing_loader, model_name, int(batch_size/2))
+        # xm.optimizer_step(optimizer)
+        # xm.mark_step()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--mode')
+    parser.add_argument('--ckpt', default=None)
+    parser.add_argument('--model', default='LSTM_CLS')
+    parser.add_argument('--epoch', default=30)
+    parser.add_argument('--batch_size', default=1)
+    parser.add_argument('--max_len', default=250)
+    parser.add_argument('--lr', default=0.001)
+
+    args = parser.parse_args()
+
+    root = ''
+    data_root = 'data'
+    data_path = os.path.join(data_root, 'train.csv')
+    pn_path = os.path.join(data_root, 'patient_notes.csv')
+    feature_path = os.path.join(data_root, 'features.csv')
+    preprocessor = utils.Preprocessor(data_path, pn_path, feature_path)
+    dataset = preprocessor.to_dataframe()
+    getter = SentenceGetter(dataset)
+    dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    # dev = xm.xla_device()
+
+    # ========= Tag to idx ========== #
+    tag2idx = preprocessor.make_vocab()
+    sentences = [' '.join([s[0] for s in sent]) for sent in getter.sentences]
+    # sentences = [s[0] for sent in getter.sentences for s in sent]
+    # print(sentences)
+    labels = [[s[1] for s in sent] for sent in getter.sentences]
+    labels = [[tag2idx.get(l) for l in lab] for lab in labels]
+
+    # ========= Training variables ========== #
+    MAX_LEN = 250
+    TRAIN_BATCH_SIZE = 32
+    VALID_BATCH_SIZE = 16
+    EPOCHS = 30
+    LEARNING_RATE = 0.001
+    # tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+    # tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
+    # tokenizer = AutoTokenizer.from_pretrained('transformersbook/bert-base-uncased-finetuned-clinc')
+    tokenizer = AutoTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')
+
+    # ========= Creating the dataset and dataloader for the neural network ========== #
+    train_percent = 0.8
+    train_size = int(train_percent * len(sentences))
+    # train_dataset=df.sample(frac=train_size,random_state=200).reset_index(drop=True)
+    # test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
+    train_sentences = sentences[0:train_size]
+    # print(train_sentences)
+    train_labels = labels[0:train_size]
+
+    test_sentences = sentences[train_size:]
+    test_labels = labels[train_size:]
+
+    print("FULL Dataset: {}".format(len(sentences)))
+    print("TRAIN Dataset: {}".format(len(train_sentences)))
+    print("TEST Dataset: {}".format(len(test_sentences)))
+
+    training_set = CustomDataset(tokenizer, train_sentences, train_labels, MAX_LEN)
+    testing_set = CustomDataset(tokenizer, test_sentences, test_labels, MAX_LEN)
+
+    # ========= Parameters ========== #
+    train_params = {'batch_size': TRAIN_BATCH_SIZE,
+                    'shuffle': True,
+                    'num_workers': 0
+                    }
+
+    test_params = {'batch_size': VALID_BATCH_SIZE,
+                   'shuffle': True,
+                   'num_workers': 0
+                   }
+
+    training_loader = DataLoader(training_set, **train_params)
+    testing_loader = DataLoader(training_set, **test_params)
+
+    # ========= Char embedding ========== #
+    embeds = utils.char_embedding(training_loader)
+
+    # optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
+    criterion = nn.CrossEntropyLoss()
+
+    if args.mode == 'train':
+        if args.model == 'BERT':
+            model = BERT()
+        elif args.model == 'BERT_LSTM_CNN':
+            model = BERT_LSTM_CNN()
+        elif args.model == 'LSTM_CLS':
+            model = LSTM_CLS(287)
+        else:
+            model = None
+        model.to(dev)
+        # optimizer = torch.optim.SGD(params=model.parameters(), lr=LEARNING_RATE, momentum=0.9, weight_decay=0.9)
+        optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)
+        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, EPOCHS)
+        train(EPOCHS, TRAIN_BATCH_SIZE, args.model)
+    elif args.mode == 'test':
+        model_path = args.ckpt
+        # model = torch.load(model_path, map_location=torch.device('cpu'))
+        model = torch.load(model_path)
+        validation(model, testing_loader)
@@ -0,0 +1,9 @@
+import torch
+import torch as F
+import numpy as np
+from sklearn.metrics import classification_report
+
+x = [1,2,3,4,5,6,7,8,9,10]
+print(x[0:2])
+print(x[2:4])
+print(x[4:10])