
Commit

updating readme [skip ci]
GemmaTuron authored and ersilia-bot committed Dec 1, 2023
2 parents c2dcb6f + dee0bb8 commit 338ee24
Showing 9 changed files with 2,346 additions and 19 deletions.
3 changes: 3 additions & 0 deletions .gitattributes
@@ -1 +1,4 @@
mock.txt filter=lfs diff=lfs merge=lfs -text
*.pkl.gz filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
model/checkpoints/models/ChemBL_atom_encoder.pth filter=lfs diff=lfs merge=lfs -text
7 changes: 6 additions & 1 deletion Dockerfile
@@ -1,7 +1,12 @@
FROM bentoml/model-server:0.11.0-py37
FROM bentoml/model-server:0.11.0-py38
MAINTAINER ersilia

RUN pip install rdkit
RUN pip install numpy==1.23.1
RUN pip install fastai==1.0.61
RUN pip install scikit-learn
RUN pip install spacy


WORKDIR /repo
COPY . /repo
46 changes: 44 additions & 2 deletions README.md
@@ -1,3 +1,45 @@
# Ersilia Model In Progress
# Molecular Prediction Model Fine-Tuning (MolPMoFiT)

This model is work in progress. Please edit the [metadata.json](metadata.json) file to complete the information about the model. This README file will be updated automatically based on the information contained in that folder.
Using self-supervised learning, the authors pre-trained a large model using one million unlabelled molecules from ChEMBL. This model can subsequently be fine-tuned for various QSAR tasks. Here, we provide the encodings of the molecular structures obtained with the pre-trained model, not the fine-tuned QSAR models.

## Identifiers

* EOS model ID: `eos9zw0`
* Slug: `molpmofit`

## Characteristics

* Input: `Compound`
* Input Shape: `Single`
* Task: `Representation`
* Output: `Other value`
* Output Type: `Float`
* Output Shape: `Matrix`
* Interpretation: Embedding vectors are obtained for each SMILES string and represented as a matrix, where each row is the 400-dimensional embedding of one SMILES character. The pretrained model is loaded using the fastai library (see the sketch below for reading the output).
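
Below is a minimal sketch (illustrative, not part of the model code) for loading the output written by the run script in `model/framework/code/main.py`. It assumes the output CSV is named `output.csv` and has the header columns `emb-0` … `emb-399`, with one row per input SMILES:

```python
# Minimal sketch: load the 400-dimensional embeddings produced by the model.
# "output.csv" is an assumed file name; the header columns are emb-0 ... emb-399.
import pandas as pd

df = pd.read_csv("output.csv")
embeddings = df.to_numpy()  # shape: (number of input SMILES, 400)
print(embeddings.shape)
```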

## References

* [Publication](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-020-00430-x)
* [Source Code](https://github.com/XinhaoLi74/MolPMoFiT)
* Ersilia contributor: [GemmaTuron](https://github.com/GemmaTuron)

## Ersilia model URLs
* [GitHub](https://github.com/ersilia-os/eos9zw0)
* [AWS S3](https://ersilia-models-zipped.s3.eu-central-1.amazonaws.com/eos9zw0.zip)
* [DockerHub](https://hub.docker.com/r/ersiliaos/eos9zw0) (AMD64, ARM64)

## Citation

If you use this model, please cite the [original authors](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-020-00430-x) of the model and the [Ersilia Model Hub](https://github.com/ersilia-os/ersilia/blob/master/CITATION.cff).

## License

This package is licensed under a GPL-3.0 license. The model contained within this package is licensed under a CC license.

Notice: Ersilia grants access to these models 'as is', as provided by the original authors. Please refer to the original code repository and/or publication if you use the model in your research.

## About Us

The [Ersilia Open Source Initiative](https://ersilia.io) is a Non Profit Organization ([1192266](https://register-of-charities.charitycommission.gov.uk/charity-search/-/charity-details/5170657/full-print)) whose mission is to equip labs, universities and clinics in LMICs with AI/ML tools for infectious disease research.

[Help us](https://www.ersilia.io/donate) achieve our mission!
32 changes: 23 additions & 9 deletions metadata.json
@@ -4,19 +4,33 @@
"Status": "In progress",
"Title": "Molecular Prediction Model Fine-Tuning (MolPMoFiT)",
"Description": "Using self-supervised learning, the authors pre-trained a large model using one millon unlabelled molecules from ChEMBL. This model can subsequently be fine-tuned for various QSAR tasks. Here, we provide the encodings for the molecular structures using the pre-trained model, not the fine-tuned QSAR models.",
"Mode": "",
"Task": [],
"Input": [],
"Input Shape": "",
"Output": [],
"Output Type": [],
"Output Shape": "",
"Interpretation": "",
"Mode": "Pretrained",
"Task": [
"Representation"
],
"Input": [
"Compound"
],
"Input Shape": "Single",
"Output": [
"Other value"
],
"Output Type": [
"Float"
],
"Output Shape": "Matrix",
"Interpretation": "Embedding vectors of each smiles are obtained, represented in a matrix, where each row is a vector of embedding of each smiles character, with a dimension of 400. The pretrained model is loaded using the fastai library",
"Tag": [
"Descriptor",
"Embedding"
],
"Publication": "https://jcheminf.biomedcentral.com/articles/10.1186/s13321-020-00430-x",
"Source Code": "https://github.com/XinhaoLi74/MolPMoFiT",
"License": "None"
"License": "CC",
"S3": "https://ersilia-models-zipped.s3.eu-central-1.amazonaws.com/eos9zw0.zip",
"DockerHub": "https://hub.docker.com/r/ersiliaos/eos9zw0",
"Docker Architecture": [
"AMD64",
"ARM64"
]
}
Binary file added model/checkpoints/ChemBL_atom_vocab.pkl
Binary file not shown.
3 changes: 3 additions & 0 deletions model/checkpoints/models/ChemBL_atom_encoder.pth
Git LFS file not shown
74 changes: 67 additions & 7 deletions model/framework/code/main.py
@@ -1,21 +1,79 @@
# imports
import os
import csv
import pickle
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*') # switch off RDKit warning messages
from fastai import *
from fastai.text import *
from utils import *
from sklearn.model_selection import train_test_split
from torch.nn import functional as F

import sys
from rdkit import Chem
from rdkit.Chem.Descriptors import MolWt
import os

# parse arguments
input_file = sys.argv[1]
output_file = sys.argv[2]

# current file directory
root = os.path.dirname(os.path.abspath(__file__))
path= os.path.abspath(os.path.join(root,".."))
path_to_checkpoint=os.path.abspath(os.path.join(root,"..", "..", "checkpoints"))
path_vocab= os.path.abspath(os.path.join(root,"..","..","checkpoints", "ChemBL_atom_vocab.pkl"))
path_bbp= os.path.abspath(os.path.join(root,"..","data","QSAR","bbbp.csv"))

#Read the vocabulary
with open(f'{path_vocab}', 'rb') as f:
    orig_itos = pickle.load(f)

#Load the vocabulary
vocab = Vocab(orig_itos)
print(f'Vocab Size: {len(vocab.itos)}')
#Initialize the Tokenizer
tok = Tokenizer(partial(MolTokenizer, special_tokens = special_tokens), n_cpus=6, pre_rules=[], post_rules=[])

#Read the dataset for QSAR tasks
#Load the data and perform data augmentation
bbbp = pd.read_csv(path_bbp)

train, test = train_test_split(bbbp,
test_size=0.1, shuffle = True, random_state = 8)

train, val = train_test_split(train,
test_size=0.1, shuffle = True, random_state = 42)

bs = 128 #batch size

#Create the specific group of data for the language model, the same data used to train the model.
#Build the fastai databunch
qsar_db = TextClasDataBunch.from_df(path_to_checkpoint, train, val, bs=bs, tokenizer=tok,
chunksize=50000, text_cols='smiles',label_cols='p_np',
vocab=vocab, max_vocab=60000, include_bos=False)

#create the classification/regression learner.
cls_learner = text_classifier_learner(qsar_db, AWD_LSTM, pretrained=False, drop_mult=0.1)

#The encoder of the model is loaded before training; we then access the first layer (the embedding layer) to obtain the embeddings.
#learner.load_encoder() will load the model from path/models/
cls_learner.load_encoder('ChemBL_atom_encoder')

def get_normalized_embeddings():
    return F.normalize(cls_learner.model[0].module.encoder.weight)

embs_v1 = get_normalized_embeddings()
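# Illustrative check (not part of the original commit): embs_v1 is the normalized
# token-embedding matrix, with one 400-dimensional row per vocabulary token.
# print(embs_v1.shape)  # e.g. torch.Size([len(vocab.itos), 400])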

# my model
def my_model(smiles_list):
    return [MolWt(Chem.MolFromSmiles(smi)) for smi in smiles_list]
    list_embeddings = []
    tokenizer = MolTokenizer()
    for smile in smiles_list:
        smile_tokenizer = tokenizer.tokenizer(smile)
        indices = [qsar_db.vocab.itos.index(token) for token in smile_tokenizer]
        embes = embs_v1[indices][:, :].numpy()
        embes = np.mean(embes, axis=0)
        list_embeddings.append(embes)

    return list_embeddings
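# Illustrative usage (not part of the original commit): my_model(["CCO"]) returns a
# list containing a single 400-dimensional numpy array, the mean of the token
# embeddings of the SMILES string "CCO".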

# read SMILES from .csv file, assuming one column with header
with open(input_file, "r") as f:
@@ -34,6 +92,8 @@ def my_model(smiles_list):
# write output in a .csv file
with open(output_file, "w") as f:
    writer = csv.writer(f)
    writer.writerow(["value"]) # header
    writer.writerow(["emb-{0}".format(i) for i in range(400)]) # header
    for o in outputs:
        writer.writerow([o])
        writer.writerow(list(o))


160 changes: 160 additions & 0 deletions model/framework/code/utils.py
@@ -0,0 +1,160 @@
from fastai import *
from fastai.text import *
from sklearn.metrics import roc_auc_score
from rdkit import Chem
import numpy as np
import threading

def randomize_smiles(smiles):
    m = Chem.MolFromSmiles(smiles)
    ans = list(range(m.GetNumAtoms()))
    np.random.shuffle(ans)
    nm = Chem.RenumberAtoms(m,ans)
    return Chem.MolToSmiles(nm, canonical=False, isomericSmiles=True, kekuleSmiles=False)
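# Illustrative example (not part of the original commit): randomize_smiles('c1ccccc1O')
# returns a valid but non-canonical SMILES for phenol, e.g. 'Oc1ccccc1'.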


def smiles_augmentation(df, N_rounds, smiles_col="SMILES"):
    dist_aug = {col_name: [] for col_name in df}

    for i in range(df.shape[0]):
        for j in range(N_rounds):
            dist_aug[smiles_col].append(randomize_smiles(df.iloc[i][smiles_col]))
            dist_aug['canonical'].append('no')

    df_aug = pd.DataFrame.from_dict(dist_aug)

    #merge with original df
    df = pd.concat([df, df_aug], sort=False).reset_index(drop=True)
    #shuffle the data
    df = df.reindex(np.random.permutation(df.index))
    return pd.DataFrame.from_dict(df).drop_duplicates(smiles_col)

# Don't include the default special tokens of fastai, only keep the padding token
BOS,EOS,FLD,UNK,PAD = 'xxbos','xxeos','xxfld','xxunk','xxpad'
TK_MAJ,TK_UP,TK_REP,TK_WREP = 'xxmaj','xxup','xxrep','xxwrep'
defaults.text_spec_tok = [PAD]

special_tokens = ['[BOS]', '[C@H]', '[C@@H]','[C@]', '[C@@]','[C-]','[C+]', '[c-]', '[c+]','[cH-]',
'[nH]', '[N+]', '[N-]', '[n+]', '[n-]', '[NH+]', '[NH2+]',
'[O-]', '[S+]', '[s+]', '[S-]', '[O+]', '[SH]', '[B-]','[BH2-]', '[BH3-]','[b-]',
'[PH]','[P+]', '[I+]',
'[Si]','[SiH2]', '[Se]','[SeH]', '[se]', '[Se+]', '[se+]','[te]','[te+]', '[Te]']

class MolTokenizer(BaseTokenizer):
    def __init__(self, lang = 'en', special_tokens = special_tokens):
        self.lang = lang
        self.special_tokens = special_tokens

    def tokenizer(self, smiles):
        # add the special token '[BOS]' to represent the start of the SMILES string
        smiles = '[BOS]' + smiles
        regex = '(\[[^\[\]]{1,10}\])'
        char_list = re.split(regex, smiles)
        tokens = []

        if self.special_tokens:
            for char in char_list:
                if char.startswith('['):
                    if char in special_tokens:
                        tokens.append(str(char))
                    else:
                        tokens.append('[UNK]')
                else:
                    chars = [unit for unit in char]
                    [tokens.append(i) for i in chars]

        if not self.special_tokens:
            for char in char_list:
                if char.startswith('['):
                    tokens.append(str(char))
                else:
                    chars = [unit for unit in char]
                    [tokens.append(i) for i in chars]

        # fix 'Br' being split into 'B' and 'r'
        if 'B' in tokens:
            for index, tok in enumerate(tokens):
                if tok == 'B':
                    if index < len(tokens)-1: # make sure 'B' is not the last character
                        if tokens[index+1] == 'r':
                            tokens[index: index+2] = [reduce(lambda i, j: i + j, tokens[index : index+2])]

        # fix 'Cl' being split into 'C' and 'l'
        if 'l' in tokens:
            for index, tok in enumerate(tokens):
                if tok == 'l':
                    if tokens[index-1] == 'C':
                        tokens[index-1: index+1] = [reduce(lambda i, j: i + j, tokens[index-1 : index+1])]
        return tokens

    def add_special_cases(self, toks):
        pass
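
# Illustrative example (not part of the original commit):
# MolTokenizer().tokenizer('BrCCl') -> ['[BOS]', 'Br', 'C', 'Cl']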


class MolTokenizer_SPE(BaseTokenizer):
    def __init__(self, spe_tokenizer, lang = 'en'):
        self.lang = lang
        self.tok = spe_tokenizer

    def tokenizer(self, smiles):
        # add the special token '[BOS]' to represent the start of the SMILES string
        smiles = '[BOS]' + smiles

        return self.tok.tokenize(smiles).split(' ')

    def add_special_cases(self, toks):
        pass

def auroc_score(input, target):
    input, target = input.cpu().numpy()[:,1], target.cpu().numpy()
    return roc_auc_score(target, input)

class AUROC(Callback):
    _order = -20 # Needs to run before the recorder

    def __init__(self, learn, **kwargs): self.learn = learn
    def on_train_begin(self, **kwargs): self.learn.recorder.add_metric_names(['AUROC'])
    def on_epoch_begin(self, **kwargs): self.output, self.target = [], []

    def on_batch_end(self, last_target, last_output, train, **kwargs):
        if not train:
            self.output.append(last_output)
            self.target.append(last_target)

    def on_epoch_end(self, last_metrics, **kwargs):
        if len(self.output) > 0:
            output = torch.cat(self.output)
            target = torch.cat(self.target)
            preds = F.softmax(output, dim=1)
            metric = auroc_score(preds, target)
            return add_metrics(last_metrics, [metric])
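
# Illustrative usage sketch (not part of the original commit), fastai v1 style:
# learn = text_classifier_learner(qsar_db, AWD_LSTM, drop_mult=0.1)
# learn.fit_one_cycle(4, callbacks=[AUROC(learn)])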



def test_get_scores(learn, ret=False):
    preds = learn.get_preds(ordered=True)
    print(f'Testing {len(preds[0])} molecules')
    p = torch.argmax(preds[0], dim=1)
    y = preds[1]
    tp = ((p + y) == 2).sum().item()
    tn = ((p + y) == 0).sum().item()
    fp = (p > y).sum().item()
    fn = (p < y).sum().item()
    cc = (float(tp)*tn - fp*fn) / np.sqrt((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn))

    print(f'Accuracy: {(tp+tn)/len(y):.3f}')
    print(f'False Positives: {fp/len(y):.3f}')
    print(f'False Negatives: {fn/len(y):.3f}')
    print(f'Recall: {tp / (tp + fn):.3f}')
    print(f'Precision: {tp / (tp + fp):.3f}')
    print(f'Sensitivity: {tp / (tp + fn):.3f}')
    print(f'Specificity: {tn / (tn + fp):.3f}')
    print(f'MCC: {cc:.3f}')
    print(f'ROCAUC: {roc_auc_score(y,preds[0][:,1]):.3f}')

    if ret:
        return preds



