# utils_benchmark.py
import os
import glob
import pandas as pd
import numpy as np
import random
from sklearn.utils.validation import check_is_fitted
from sklearn.utils import check_array, as_float_array
from sklearn.base import TransformerMixin, BaseEstimator
import kneed
import scipy.linalg  # explicit submodule import; scipy.linalg.svd is used in ZCA_corr.fit
import seaborn as sns
import matplotlib.pyplot as plt
def load_data(exp, plate, filetype):
    """Load all profile files for one plate of an experiment into a single dataframe."""
    path = os.path.join('profiles',
                        f'{exp}',
                        f'{plate}',
                        f'*_{filetype}')
    files = glob.glob(path)
    print(files)
    df = pd.concat(pd.read_csv(file, low_memory=False) for file in files)
    return df
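# Illustrative usage of load_data (a sketch only -- the experiment, plate and file-suffix
# names below are hypothetical and must match your local 'profiles/' directory layout):
#
#     df = load_data('2020_11_04_CPJUMP1', 'BR00117010', 'normalized_negcon.csv.gz')
#     metadata, features = get_metadata(df), get_featuredata(df)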
def get_metacols(df):
"""return a list of metadata columns"""
return [c for c in df.columns if c.startswith("Metadata_")]
def get_featurecols(df):
"""returna list of featuredata columns"""
return [c for c in df.columns if not c.startswith("Metadata")]
def get_metadata(df):
"""return dataframe of just metadata columns"""
return df[get_metacols(df)]
def get_featuredata(df):
"""return dataframe of just featuredata columns"""
return df[get_featurecols(df)]
def remove_negcon_empty_wells(df):
"""return dataframe of non-negative control wells"""
df = (
df.query('Metadata_control_type!="negcon"')
.dropna(subset=['Metadata_broad_sample'])
.reset_index(drop=True)
)
return df
def select_only_controls(df):
    """return dataframe of control wells only"""
    # An earlier version also excluded outer wells (rows A/P, columns 01/24) and
    # poscon_orf controls; that filter is not applied here.
    df = (
        df.query('Metadata_pert_type=="control"')
    )
    return df
def concat_profiles(df1, df2):
"""Concatenate dataframes"""
if df1.shape[0] == 0:
df1 = df2.copy()
else:
frames = [df1, df2]
df1 = pd.concat(frames, ignore_index=True, join="inner")
return df1
def percent_score(null_dist, corr_dist, how):
"""
Calculates the Percent strong or percent recall scores
:param null_dist: Null distribution
:param corr_dist: Correlation distribution
:param how: "left", "right" or "both" for using the 5th percentile, 95th percentile or both thresholds
:return: proportion of correlation distribution beyond the threshold
"""
if how == 'right':
perc_95 = np.nanpercentile(null_dist, 95)
above_threshold = corr_dist > perc_95
return np.mean(above_threshold.astype(float))*100, perc_95
if how == 'left':
perc_5 = np.nanpercentile(null_dist, 5)
below_threshold = corr_dist < perc_5
return np.mean(below_threshold.astype(float))*100, perc_5
if how == 'both':
perc_95 = np.nanpercentile(null_dist, 95)
above_threshold = corr_dist > perc_95
perc_5 = np.nanpercentile(null_dist, 5)
below_threshold = corr_dist < perc_5
return (np.mean(above_threshold.astype(float)) + np.mean(below_threshold.astype(float)))*100, perc_95, perc_5
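# Illustrative use of percent_score (a sketch; the arrays below are synthetic stand-ins
# for a null distribution and a replicate-correlation distribution):
#
#     null_dist = np.random.normal(loc=0.0, scale=0.1, size=1000)
#     corr_dist = np.random.normal(loc=0.5, scale=0.2, size=100)
#     percent_strong, threshold_95 = percent_score(null_dist, corr_dist, how='right')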
def corr_between_replicates(df, group_by_feature, percent_matching=False):
    """
    Correlation between replicates
    Parameters:
    -----------
    df: pd.DataFrame
    group_by_feature: Feature name to group the data frame by
    percent_matching: bool, if True mask within-compound correlations so that only
        correlations between different compounds in the same group are kept
    Returns:
    --------
    list-like of correlation values
    """
    replicate_corr = []
    replicate_grouped = df.groupby(group_by_feature)
    for name, group in replicate_grouped:
        group_features = get_featuredata(group)
        if len(group_features) == 1:  # If there is only one replicate on a plate
            replicate_corr.append(np.nan)
        else:
            corr = np.corrcoef(group_features)
            if percent_matching:
                # Mask within-compound correlations; only cross-compound correlations
                # contribute to the median used for Percent Matching
                corr_df = pd.DataFrame(corr)
                corr_df.index = list(group['Metadata_pert_iname'])
                corr_df.columns = list(group['Metadata_pert_iname'])
                for compound in group.Metadata_pert_iname.unique():
                    corr_df.loc[compound, compound] = np.nan
                corr = np.array(corr_df)
            else:
                np.fill_diagonal(corr, np.nan)
            replicate_corr.append(np.nanmedian(corr))  # median replicate correlation
    return replicate_corr
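# Illustrative use of corr_between_replicates (a sketch; assumes the dataframe carries a
# 'Metadata_broad_sample' column identifying replicate groups, plus 'Metadata_pert_iname'
# when percent_matching=True):
#
#     replicate_corr = corr_between_replicates(df, group_by_feature='Metadata_broad_sample')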
def corr_between_non_replicates(df, n_samples, n_replicates, metadata_compound_name):
"""
Null distribution between random "replicates".
Parameters:
------------
df: pandas.DataFrame
n_samples: int
n_replicates: int
metadata_compound_name: Compound name feature
Returns:
--------
list-like of correlation values, with a length of `n_samples`
"""
df.reset_index(drop=True, inplace=True)
null_corr = []
    while len(null_corr) < n_samples:
        compounds = random.choices(range(len(df)), k=n_replicates)
        sample = df.loc[compounds].copy()
        if len(sample[metadata_compound_name].unique()) == n_replicates:
            sample_features = get_featuredata(sample)
            corr = np.corrcoef(sample_features)
            np.fill_diagonal(corr, np.nan)
            null_corr.append(np.nanmedian(corr))  # median pairwise correlation of the random sample
return null_corr
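# Illustrative pairing of the null and signal distributions into a Percent Replicating
# score (a sketch; the column name and sample counts are assumptions):
#
#     null = corr_between_non_replicates(df, n_samples=1000, n_replicates=4,
#                                        metadata_compound_name='Metadata_broad_sample')
#     signal = corr_between_replicates(df, 'Metadata_broad_sample')
#     percent_replicating, value_95 = percent_score(np.array(null), np.array(signal), how='right')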
def correlation_between_modalities(modality_1_df, modality_2_df, modality_1, modality_2, metadata_common, metadata_perturbation):
"""
Compute the correlation between two different modalities.
:param modality_1_df: Profiles of the first modality
:param modality_2_df: Profiles of the second modality
    :param modality_1: "Compound", "ORF" or "CRISPR"
    :param modality_2: "Compound", "ORF" or "CRISPR"
    :param metadata_common: feature that identifies perturbation pairs
    :param metadata_perturbation: perturbation name feature
:return: list-like of correlation values
"""
list_common_perturbation_groups = list(np.intersect1d(list(modality_1_df[metadata_common]), list(modality_2_df[metadata_common])))
merged_df = pd.concat([modality_1_df, modality_2_df], ignore_index=False, join='inner')
modality_1_df = merged_df.query('Metadata_modality==@modality_1')
modality_2_df = merged_df.query('Metadata_modality==@modality_2')
corr_modalities = []
for group in list_common_perturbation_groups:
modality_1_perturbation_df = modality_1_df.loc[modality_1_df[metadata_common] == group]
modality_2_perturbation_df = modality_2_df.loc[modality_2_df[metadata_common] == group]
for sample_1 in modality_1_perturbation_df[metadata_perturbation].unique():
for sample_2 in modality_2_perturbation_df[metadata_perturbation].unique():
modality_1_perturbation_sample_df = modality_1_perturbation_df.loc[modality_1_perturbation_df[metadata_perturbation] == sample_1]
modality_2_perturbation_sample_df = modality_2_perturbation_df.loc[modality_2_perturbation_df[metadata_perturbation] == sample_2]
modality_1_perturbation_profiles = get_featuredata(modality_1_perturbation_sample_df)
modality_2_perturbation_profiles = get_featuredata(modality_2_perturbation_sample_df)
corr = np.corrcoef(modality_1_perturbation_profiles, modality_2_perturbation_profiles)
corr = corr[0:len(modality_1_perturbation_profiles), len(modality_1_perturbation_profiles):]
                corr_modalities.append(np.nanmedian(corr))  # median cross-modality correlation
return corr_modalities
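# Illustrative use of correlation_between_modalities (a sketch; assumes both dataframes
# carry 'Metadata_modality' plus the hypothetical 'Metadata_matching_target' and
# 'Metadata_broad_sample' columns used here):
#
#     matching = correlation_between_modalities(compound_df, crispr_df,
#                                               'Compound', 'CRISPR',
#                                               metadata_common='Metadata_matching_target',
#                                               metadata_perturbation='Metadata_broad_sample')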
def null_correlation_between_modalities(modality_1_df, modality_2_df, modality_1, modality_2, metadata_common, metadata_perturbation, n_samples):
"""
    Compute the null distribution of correlations between two different modalities.
:param modality_1_df: Profiles of the first modality
:param modality_2_df: Profiles of the second modality
:param modality_1: "Compound", "ORF" or "CRISPR"
:param modality_2: "Compound", "ORF" or "CRISPR"
:param metadata_common: feature that identifies perturbation pairs
:param metadata_perturbation: perturbation name feature
:param n_samples: int
    :return: list-like of correlation values
"""
list_common_perturbation_groups = list(np.intersect1d(list(modality_1_df[metadata_common]), list(modality_2_df[metadata_common])))
merged_df = pd.concat([modality_1_df, modality_2_df], ignore_index=False, join='inner')
modality_1_df = merged_df.query('Metadata_modality==@modality_1')
modality_2_df = merged_df.query('Metadata_modality==@modality_2')
null_modalities = []
count = 0
while count < n_samples:
perturbations = random.choices(list_common_perturbation_groups, k=2)
modality_1_perturbation_df = modality_1_df.loc[modality_1_df[metadata_common] == perturbations[0]]
modality_2_perturbation_df = modality_2_df.loc[modality_2_df[metadata_common] == perturbations[1]]
for sample_1 in modality_1_perturbation_df[metadata_perturbation].unique():
for sample_2 in modality_2_perturbation_df[metadata_perturbation].unique():
modality_1_perturbation_sample_df = modality_1_perturbation_df.loc[modality_1_perturbation_df[metadata_perturbation] == sample_1]
modality_2_perturbation_sample_df = modality_2_perturbation_df.loc[modality_2_perturbation_df[metadata_perturbation] == sample_2]
modality_1_perturbation_profiles = get_featuredata(modality_1_perturbation_sample_df)
modality_2_perturbation_profiles = get_featuredata(modality_2_perturbation_sample_df)
corr = np.corrcoef(modality_1_perturbation_profiles, modality_2_perturbation_profiles)
corr = corr[0:len(modality_1_perturbation_profiles), len(modality_1_perturbation_profiles):]
                null_modalities.append(np.nanmedian(corr))  # median cross-modality correlation
count += 1
return null_modalities
class ZCA_corr(BaseEstimator, TransformerMixin):
def __init__(self, copy=False):
self.copy = copy
    def estimate_regularization(self, eigenvalue):
        x = list(range(len(eigenvalue)))
        kneedle = kneed.KneeLocator(x, eigenvalue, S=1.0, curve='convex', direction='decreasing')
        reg = eigenvalue[kneedle.elbow] / 10.0
        return reg  # the imaginary part of the eigenvalues is dropped before this call (T.real)
def fit(self, X, y=None):
"""
Compute the mean, sphering and desphering matrices.
Parameters
----------
X : array-like with shape [n_samples, n_features]
The data used to compute the mean, sphering and desphering
matrices.
"""
X = check_array(X, accept_sparse=False, copy=self.copy, ensure_2d=True)
X = as_float_array(X, copy=self.copy)
self.mean_ = X.mean(axis=0)
X_ = X - self.mean_
cov = np.dot(X_.T, X_) / (X_.shape[0] - 1)
V = np.diag(cov)
df = pd.DataFrame(X_)
corr = np.nan_to_num(df.corr()) # replacing nan with 0 and inf with large values
G, T, _ = scipy.linalg.svd(corr)
regularization = self.estimate_regularization(T.real)
t = np.sqrt(T.clip(regularization))
t_inv = np.diag(1.0 / t)
v_inv = np.diag(1.0/np.sqrt(V.clip(1e-3)))
self.sphere_ = np.dot(np.dot(np.dot(G, t_inv), G.T), v_inv)
return self
def transform(self, X, y=None, copy=None):
"""
Parameters
----------
X : array-like with shape [n_samples, n_features]
The data to sphere along the features axis.
"""
check_is_fitted(self, "mean_")
X = as_float_array(X, copy=self.copy)
return np.dot(X - self.mean_, self.sphere_.T)
def sphere_plate_zca_corr(plate):
"""
sphere each plate to the DMSO negative control values
Parameters:
-----------
plate: pandas.DataFrame
dataframe of a single plate's featuredata and metadata
Returns:
-------
pandas.DataFrame of the same shape as `plate`
"""
# sphere featuredata to DMSO sphering matrix
spherizer = ZCA_corr()
dmso_df = plate.loc[plate.Metadata_control_type=="negcon"]
dmso_vals = get_featuredata(dmso_df).to_numpy()
all_vals = get_featuredata(plate).to_numpy()
spherizer.fit(dmso_vals)
sphered_vals = spherizer.transform(all_vals)
# concat with metadata columns
feature_df = pd.DataFrame(
sphered_vals, columns=get_featurecols(plate), index=plate.index
)
metadata = get_metadata(plate)
combined = pd.concat([metadata, feature_df], axis=1)
assert combined.shape == plate.shape
return combined
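# Illustrative plate-wise sphering (a sketch; assumes a 'Metadata_Plate' column and that
# each plate contains negcon wells for the DMSO sphering fit):
#
#     sphered = pd.concat(
#         [sphere_plate_zca_corr(plate_df) for _, plate_df in df.groupby('Metadata_Plate')],
#         ignore_index=True,
#     )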
def distribution_plot(df, output_file, metric):
if metric == 'Percent Replicating':
metric_col = 'Percent_Replicating'
null = 'Null_Replicating'
null_label = 'non-replicates'
signal = 'Replicating'
signal_label = 'replicates'
x_label = 'Replicate correlation'
    elif metric == 'Percent Matching':
        metric_col = 'Percent_Matching'
        null = 'Null_Matching'
        null_label = 'non-matching perturbations'
        signal = 'Matching'
        signal_label = 'matching perturbations'
        x_label = 'Correlation between perturbations targeting the same gene'
    else:
        raise ValueError(f"metric must be 'Percent Replicating' or 'Percent Matching', got {metric!r}")
n_experiments = len(df)
plt.rcParams['figure.facecolor'] = 'white' # Enabling this makes the figure axes and labels visible in PyCharm Dracula theme
plt.figure(figsize=[12, n_experiments * 6])
for i in range(n_experiments):
plt.subplot(n_experiments, 1, i + 1)
plt.hist(df.loc[i, f'{null}'], label=f'{null_label}', density=True, bins=20, alpha=0.5)
plt.hist(df.loc[i, f'{signal}'], label=f'{signal_label}', density=True, bins=20, alpha=0.5)
plt.axvline(df.loc[i, 'Value_95'], label='95% threshold')
plt.legend(fontsize=20)
plt.title(
f"{df.loc[i, 'Description']}\n" +
f"{metric} = {df.loc[i, f'{metric_col}']}",
fontsize=25
)
plt.ylabel("density", fontsize=25)
plt.xlabel(f"{x_label}", fontsize=25)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
sns.despine()
plt.tight_layout()
plt.savefig(f'figures/{output_file}')
plt.show()
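# Illustrative use of distribution_plot (a sketch; the summary-dataframe layout is an
# assumption -- it needs 'Description', 'Null_Replicating', 'Replicating', 'Value_95' and
# 'Percent_Replicating' columns, and a 'figures/' directory must already exist):
#
#     summary = pd.DataFrame({
#         'Description': ['Example experiment'],
#         'Null_Replicating': [list(np.random.normal(0, 0.1, 1000))],
#         'Replicating': [list(np.random.normal(0.5, 0.2, 100))],
#         'Value_95': [0.16],
#         'Percent_Replicating': ['90.0'],
#     })
#     distribution_plot(summary, 'example_replicating.png', 'Percent Replicating')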