-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathextract_smiles_from_file_name.py
46 lines (40 loc) · 1.7 KB
/
extract_smiles_from_file_name.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import os
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem.Descriptors import MolWt
from src.reward_utils import get_cycle_reward, get_logp_reward, get_qed_reward, get_sa_reward
from src.data_process_utils import standardize_smiles_error_handle
from generate_similar_mol import _canonicalize_smiles, _parallel_get_ext_dists, _parallel_get_fps
def get_file_names(sample_dir='C://Users//yhytx//SEED//gen_samples_rl_similar_binder',
                   output_csv='generated_molecules_similar_binder.csv'):
    """Write a CSV of molecule properties for every ``.png`` file in a directory.

    The name of each ``.png`` file, minus its extension, is passed as a SMILES
    string to ``get_smi_property``; the returned records become the rows of the
    CSV written to ``output_csv``.

    Args:
        sample_dir: Directory to scan for ``.png`` files. Defaults to the
            location this script was originally hard-coded to use.
        output_csv: Path of the CSV file to write. Defaults to the original
            output file name.
    """
    png_suffix = '.png'
    df_rows = []
    # os.listdir returns entries in arbitrary order; sorting keeps the CSV row
    # order reproducible between runs.
    for file_name in sorted(os.listdir(sample_dir)):
        if not file_name.endswith(png_suffix):
            continue
        # Drop only the trailing extension so the rest of the name is passed
        # through untouched.
        smi = file_name[:-len(png_suffix)]
        df_rows.append(get_smi_property(smi))
    # NOTE: the interactive breakpoint() calls that used to live here were
    # debugging leftovers and blocked unattended runs; inspect rows with
    # similarity == 1 in the CSV instead.
    gen_samples_good_df = pd.DataFrame(df_rows)
    gen_samples_good_df.to_csv(output_csv, index=False)
def get_smi_property(smi, reference_fps=None):
    """Collect the property record for a single SMILES string.

    Args:
        smi: SMILES string of the molecule to describe.
        reference_fps: Optional reference fingerprints to compare against.
            When omitted, the module-level ``ref_fps`` (set up by the script
            entry point) is used, matching the original behaviour.

    Returns:
        A dict with the keys ``Smiles``, ``NumAtoms``, ``logp``, ``sa``,
        ``cycle``, ``qed``, ``similarity``, ``mol_weight`` and ``matched``.

    Raises:
        ValueError: If RDKit cannot parse ``smi``.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        # RDKit reports an unparsable SMILES by returning None; fail here with
        # a clear message instead of an opaque error inside MolWt.
        raise ValueError(f"Could not parse SMILES: {smi!r}")
    if reference_fps is None:
        # Fall back to the global populated under ``__main__``.
        reference_fps = ref_fps
    mol_w = MolWt(mol)
    fp = _parallel_get_fps(smi)
    # NOTE(review): the helper is named "..._dists" but its first result is
    # stored as 'similarity' -- confirm which quantity it actually returns.
    sim, sim_idx = _parallel_get_ext_dists(fp, reference_fps)
    return {
        "Smiles": smi,
        "NumAtoms": mol.GetNumAtoms(),
        # Reward scores are rounded to 4 decimals for the CSV output.
        'logp': np.round(get_logp_reward(smi), 4),
        'sa': np.round(get_sa_reward(smi), 4),
        'cycle': get_cycle_reward(smi),
        'qed': np.round(get_qed_reward(smi), 4),
        'similarity': sim,
        'mol_weight': mol_w,
        # Presumably converts a 0-based reference index to 1-based; verify
        # against _parallel_get_ext_dists.
        'matched': sim_idx + 1,
    }
if __name__ == "__main__":
    # Read the reference binders and run every SMILES through the project's
    # standardization helper before fingerprinting.
    binders_table = pd.read_csv('data/binders.csv')
    standardized_smiles = binders_table.smiles.apply(standardize_smiles_error_handle)
    # Despite the name, this is a plain list of SMILES strings.
    ref_smiles_df = standardized_smiles.values.tolist()
    # ref_fps is a module-level global that get_smi_property reads by default.
    ref_fps = list(map(_parallel_get_fps, ref_smiles_df))
    get_file_names()