-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathheatmap_full_DHFR_from_raw_data.R
356 lines (287 loc) · 17 KB
/
heatmap_full_DHFR_from_raw_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
library(pheatmap)
library(stringr)
library(matrixStats)
library(RColorBrewer)
library(dplyr)
library(reshape2)
library(ggplot2)
library(gridExtra)
library(hash) #to store infos for the different tiles
# read in raw data as it comes from enrich. We use the file 'main_synonymous_scores.tsv' in the experiement folder of the tsv folder.
# for the full seq of dhfr that is i.e. tile1_all_reps_minvarcount10_corr_sy/tsv/tile1_all_reps_exp/main_synonymous_scores.tsv
# removes double mutants
# does not do anything about vars that occur in two tiles since each tile is plotted separately
# tile borders have to specified inside the 'make_tile_data' function
# plots each tile from the specified start position to end position
# might have issues if positions are supplied for which there is no data.
# Same with substitutions for which there is no data, which is why I take Ter out of the order of aa substitutions for tiles
# 3,4,5 since they appear to not have scores for early terminations.
# To make a heatmap encompassing all tiles, first the data needs to be compiled into a single dataframe. This is done in the script:
# prep_MAVE_data.R
#OBS: change paths to what is applicable for you. You can find all lines where paths are mentioned by searching for 'base_dir'
#based on heatmap_shorter.R . For the new full dhfr MAVE data that only has one condition but all 5 tiles (should be full DHFR)
#heatmap making functions from heatmap_shorter.R
base_dir <- '/home/henrike/Documents/PD_AS/projects/Sofie_Mave/results/enrich_full_dhfr/'
corr_descr <- 'corrected with synonymous counts' #'corrected with counted reads' #'corrected with WT counts' # 'corrected with all reads' # 'corrected with counted reads'
corr <- '_corr_sy' #'_corr_complete' #'' #'_corr_full' #'_corr_complete'
#tile <- "5"
#dict storing necessary infos on the 5 different tiles
############################
make_tile_data <- function(){
h <- hash()
# set values
h[["1"]] <- hash()
h[['1']][['start_pos']] <- 2
h[['1']][['tile_end']] <- 39
h[['1']][['WT_seq']] <- c('Val','Gly','Ser','Leu','Asn','Cys','Ile','Val','Ala','Val','Ser','Gln','Asn','Met','Gly','Ile','Gly','Lys',
'Asn','Gly','Asp','Leu','Pro','Trp','Pro','Pro','Leu','Arg','Asn','Glu','Phe','Arg','Tyr','Phe','Gln','Arg','Met','Thr')
h[["2"]] <- hash()
#the mutagenized region is 31 - 77 but we only have read overlap (from fw rv pair) from aa 37 to 72
h[['2']][['start_pos']] <- 37
h[['2']][['tile_end']] <- 72
h[['2']][['WT_seq']] <- c('Arg', 'Met', 'Thr', 'Thr', 'Thr', 'Ser', 'Ser', 'Val', 'Glu', 'Gly', 'Lys', 'Gln', 'Asn', 'Leu',
'Val', 'Ile', 'Met', 'Gly', 'Lys', 'Lys', 'Thr', 'Trp', 'Phe', 'Ser', 'Ile', 'Pro', 'Glu', 'Lys',
'Asn', 'Arg', 'Pro', 'Leu', 'Lys', 'Gly', 'Arg', 'Ile')
h[["3"]] <- hash()
#the mutagenized region is 75 - 121 but we only have read overlap (from fw rv pair) from aa 81 to 116
h[['3']][['start_pos']] <- 81
h[['3']][['tile_end']] <- 116
h[['3']][['WT_seq']] <- c('Lys', 'Glu', 'Pro', 'Pro', 'Gln', 'Gly', 'Ala', 'His', 'Phe', 'Leu', 'Ser', 'Arg', 'Ser', 'Leu', 'Asp',
'Asp', 'Ala', 'Leu', 'Lys', 'Leu', 'Thr', 'Glu', 'Gln', 'Pro', 'Glu', 'Leu', 'Ala', 'Asn', 'Lys', 'Val',
'Asp', 'Met', 'Val', 'Trp', 'Ile', 'Val' )
h[["4"]] <- hash()
#the mutagenized region is 119 - 156 but we have read overlap (from fw rv pair) from aa 116 to 159 so I included counting those muts
#there should be none if I understand correctly
h[['4']][['start_pos']] <- 116
h[['4']][['tile_end']] <- 159
h[['4']][['WT_seq']] <- c('Val', 'Gly', 'Gly', 'Ser', 'Ser', 'Val', 'Tyr', 'Lys', 'Glu', 'Ala', 'Met', 'Asn', 'His', 'Pro', 'Gly',
'His', 'Leu', 'Lys', 'Leu', 'Phe', 'Val', 'Thr', 'Arg', 'Ile', 'Met', 'Gln', 'Asp', 'Phe', 'Glu', 'Ser',
'Asp', 'Thr', 'Phe', 'Phe', 'Pro', 'Glu', 'Ile', 'Asp', 'Leu', 'Glu', 'Lys', 'Tyr', 'Lys', 'Leu')
h[["5"]] <- hash()
#the mutagenized region is 153 - 187 but we have read overlap (from fw rv pair) from aa 146 til the stop codon (pos after 187)
h[['5']][['start_pos']] <- 146
h[['5']][['tile_end']] <- 187
h[['5']][['WT_seq']] <- c('Asp', 'Thr', 'Phe', 'Phe', 'Pro', 'Glu', 'Ile', 'Asp', 'Leu', 'Glu', 'Lys', 'Tyr', 'Lys', 'Leu', 'Leu',
'Pro', 'Glu', 'Tyr', 'Pro', 'Gly', 'Val', 'Leu', 'Ser', 'Asp', 'Val', 'Gln', 'Glu', 'Glu', 'Lys', 'Gly',
'Ile', 'Lys', 'Tyr', 'Lys', 'Phe', 'Glu', 'Val', 'Tyr', 'Glu', 'Lys', 'Asn', 'Asp')
return(h)
}
h <- make_tile_data()
# get values example
#h[["1"]]
#h[['1']][['start_pos']]
#h[[tile]][['start_pos']]
#################################
#for including different conditions see same function in script heatmaps_shorter.R
make_heatmap <- function(dat, start_pos, tile_end, pos_WT, aa_WT, order_aa, min_score=0,max_score=0, descr='', mid_point = 0) {
#get the synonymous and WT scores and subset data to only scores for single vars
sy_score <- dat[1,'score']
wt_score <- dat[2,'score']
#now omit these two rows
dat <- dat[-c(1,2),]
#get only scores for single mutants
d <- dat[!grepl(',',dat$var, fixed = T),c('var','score')]
#colnames(d) <- c('var','score')
#make cols for position and substiution
d$pos <- as.numeric(str_extract(d$var, "([0-9]+)"))
d$sub <- str_match(d$var, "[0-9]+([A-Za-z]+)")[,2]
#drop empty rows
d <- d[complete.cases(d),]
#order by position
d <- arrange(d, pos)
#cast into matrix format
mat <- dcast(d, pos ~ sub, value.var = "score")
rownames(mat) <- mat$pos
#remove pos column
mat <- mat[,-c(1)]
mat$pos_median = rowMedians(as.matrix(mat), na.rm = T)
#need to padd missing rows, i.e. positions where nothing has scored with NA
all_rows <- seq(start_pos,tile_end)
#missing rows:
missing_rows <- all_rows[!all_rows %in% rownames(mat)]
for (i in missing_rows) {
mat[as.character(i),] <- rep(NA,ncol(mat))
}
#arrange cols == amino acids according to properties and rows by row number
#https://stackoverflow.com/questions/32593434/r-change-row-order
mat <- mat[order(as.numeric(row.names(mat))),order_aa]
#create dummy for marking the wild type in the plot
dummy <- rep(wt_score,length(pos_WT))
WT <- data.frame(pos_WT, aa_WT, dummy)
if (min_score == 0)
{min_score <- min(d$score, na.rm = T) - 0.1}
if (max_score == 0)
{max_score <- max(d$score, na.rm = T) + 0.1}
print(paste0('min_score is: ', min_score))
print(paste0('max_score is: ', max_score))
p <- ggplot(melt(as.matrix(mat)), aes(x=Var1, y=Var2)) +
geom_tile(aes(fill = value), color = "grey50") +
#https://stackoverflow.com/questions/29476925/change-colors-of-raster-plot-in-ggplot2
#scale_fill_gradientn(name = "MAVE Score", colors = c("#830823", "#c3393b", "#d6604d", "#f09b7a", "#fce2d3",
# "#f9f2ee", "white", "#e1edf3", "#c0ddec", "#a6cfe4", "#3683bb",
# "#1b5b9d", "#063264")) +
#use this to find automatic limits to the color scale
#scale_fill_gradient2(name = "MAVE Score", midpoint = 0, low = "#830823", high = "#063264", mid = 'white',
# limits = c(-max(abs(min(mat, na.rm = T)),abs(max(mat, na.rm = T))),max(abs(min(mat, na.rm = T)),abs(max(mat, na.rm = T))))) +
#add WT markers. Lets try drawing black circles
geom_tile(data = WT, aes(x = pos_WT, y=aa_WT, fill = dummy), color = 'grey50') + #scale_color_manual(values = c("white")) +
geom_point(data = WT, aes(x = pos_WT, y=aa_WT), size = 3, color = 'black') +
#this works but only displays x axis ticks every 10 positions
#xlab('pos')+ ylab('substitution') +
#which x-axis ticks to draw. It seems I need to use 1 - tile_end as limits, otherwise it just displays no x-axis scale.
#the labels and breaks are me trying to get it to display only labels between tile start and tile end, not all the way from 1.
scale_x_discrete(name ="pos", breaks=seq(start_pos,tile_end,1), labels=as.character(seq(start_pos,tile_end,1)),
limits=as.factor(seq(1,tile_end,1))) + ylab('substitution') +
#limit plot to from start of the tile to the end so we cut off all the part from 1 to start_tile
coord_cartesian(xlim = c(start_pos,tile_end)) +
#ggtitle(paste0("MAVE scores tile ", tile, ", 0 mismatch allowed, min base qual 1, average read qual 1")) +
ggtitle(descr) +
theme(axis.text=element_text(size=16), axis.title=element_text(size=18), legend.text=element_text(size=18),
plot.title = element_text(size=22))
#add color scale later so we can determine what the mid point should be
if(mid_point == 'wt'){
#use this to use the same color scale limits for all heatmaps
p <- p + scale_fill_gradient2(name = "MAVE Score", midpoint = wt_score, low = "#830823", high = "#063264", mid = 'white',
limits = c(min_score,max_score))
} else if(mid_point == 'sy'){
p <- p + scale_fill_gradient2(name = "MAVE Score", midpoint = sy_score, low = "#830823", high = "#063264", mid = 'white',
limits = c(min_score,max_score))
} else {
p <- p + scale_fill_gradient2(name = "MAVE Score", midpoint = mid_point, low = "#830823", high = "#063264", mid = 'white',
limits = c(min_score,max_score))
}
#debugging
#####################
#png(filename = paste0(base_dir,'HZ_plots/heatmap_tile',tile,'.test.png'), width = 1400, height = 800)
#print(p)
#dev.off()
return(p)
}
make_error_heatmap <- function(dat, start_pos, tile_end, pos_WT, aa_WT, order_aa, max_err=0, descr='') {
#get the synonymous and WT scores and subset data to only scores for single vars
sy_error <- dat[1,'SE']
wt_error <- dat[2,'SE']
#now omit these two rows
dat <- dat[-c(1,2),]
#get only errors for single mutants (this matrix should represent the same vars as make_heatmap but show the error instead of score)
d <- dat[!grepl(',',dat$var, fixed = T),c('var','SE')]
if (max_err == 0){
max_err <- max(d$SE) + 0.1
}
print(paste0('max_err is: ', max_err))
#colnames(d) <- c('var','score')
#make cols for position and substiution
d$pos <- as.numeric(str_extract(d$var, "([0-9]+)"))
d$sub <- str_match(d$var, "[0-9]+([A-Za-z]+)")[,2]
#drop empty rows
d <- d[complete.cases(d),]
#order by position
d <- arrange(d, pos)
#cast into matrix format
mat <- dcast(d, pos ~ sub, value.var = "SE")
rownames(mat) <- mat$pos
#remove pos column
mat <- mat[,-c(1)]
mat$pos_median = rowMedians(as.matrix(mat), na.rm = T)
#need to padd missing rows, i.e. positions where nothing has scored with NA
all_rows <- seq(start_pos,tile_end)
#missing rows:
missing_rows <- all_rows[!all_rows %in% rownames(mat)]
for (i in missing_rows) {
mat[as.character(i),] <- rep(NA,ncol(mat))
}
#arrange cols == amino acids according to properties and rows by row number
#https://stackoverflow.com/questions/32593434/r-change-row-order
mat <- mat[order(as.numeric(row.names(mat))),order_aa]
#create dummy for marking the wild type in the plot
dummy <- rep(wt_error,length(pos_WT))
WT <- data.frame(pos_WT, aa_WT, dummy)
p <- ggplot(melt(as.matrix(mat)), aes(x=Var1, y=Var2)) +
geom_tile(aes(fill = value), color = "grey50") +
#https://stackoverflow.com/questions/29476925/change-colors-of-raster-plot-in-ggplot2
#scale_fill_gradientn(name = "MAVE Score", colors = c("#830823", "#c3393b", "#d6604d", "#f09b7a", "#fce2d3",
# "#f9f2ee", "white", "#e1edf3", "#c0ddec", "#a6cfe4", "#3683bb",
# "#1b5b9d", "#063264")) +
#use this to find automatic limits to the color scale
#scale_fill_gradient2(name = "MAVE Score", midpoint = 0, low = "#830823", high = "#063264", mid = 'white',
# limits = c(-max(abs(min(mat, na.rm = T)),abs(max(mat, na.rm = T))),max(abs(min(mat, na.rm = T)),abs(max(mat, na.rm = T))))) +
#use this to use the same color scale limits for all heatmaps
scale_fill_gradient(name = "Standard Error", low = "white", high = "#830823",
limits = c(0,max_err)) +
#add WT markers. Lets try drawing black circles
geom_tile(data = WT, aes(x = pos_WT, y=aa_WT, fill = dummy), color = 'grey50') + #scale_color_manual(values = c("white")) +
geom_point(data = WT, aes(x = pos_WT, y=aa_WT), size = 3, color = 'black') +
#this works but only displays x axis ticks every 10 positions
#xlab('pos')+ ylab('substitution') +
#which x-axis ticks to draw. It seems I need to use 1 - tile_end as limits, otherwise it just displays no x-axis scale.
#the labels and breaks are me trying to get it to display only labels between tile start and tile end, not all the way from 1.
scale_x_discrete(name ="pos", breaks=seq(start_pos,tile_end,1), labels=as.character(seq(start_pos,tile_end,1)),
limits=as.factor(seq(1,tile_end,1))) + ylab('substitution') +
#limit plot to from start of the tile to the end
coord_cartesian(xlim = c(start_pos,tile_end)) +
#ggtitle(paste0("MAVE score errors tile ", tile, ", 0 mismatch allowed, min base qual 1, average read qual 1")) +
ggtitle(descr) +
theme(axis.text=element_text(size=16), axis.title=element_text(size=18), legend.text=element_text(size=18),
plot.title = element_text(size=22))
return(p)
}
#go through all 5 tiles and make heatmaps
##########################
tile <- '1' #for debug
for (tile in c('1','2','3','4','5')) {
print(c('tile', tile))
#position of the first residue
start_pos <- h[[tile]][['start_pos']]
#last position
tile_end <- h[[tile]][['tile_end']]
pos_WT <- seq(start_pos,tile_end,1)
#this is the WT aa seq, change for different tiles of dhfr
aa_WT <- h[[tile]][['WT_seq']]
#order of amino acids for the heatmap
#In the last tile there are apparently no early termination single vars so I took that out for tile 5. Same with tiles 3 and 4
if(tile == '5' | tile == '3' | tile == '4'){
order_aa <- c('His', 'Lys', 'Arg', 'Asp', 'Glu', 'Cys', 'Met', 'Asn', 'Gln', 'Ser', 'Thr', 'Ala',
'Ile', 'Leu', 'Val', 'Phe', 'Trp', 'Tyr', 'Gly', 'Pro', 'pos_median')
} else{
order_aa <- c('His', 'Lys', 'Arg', 'Asp', 'Glu', 'Cys', 'Met', 'Asn', 'Gln', 'Ser', 'Thr', 'Ala',
'Ile', 'Leu', 'Val', 'Phe', 'Trp', 'Tyr', 'Gly', 'Pro', 'Ter', 'pos_median')
}
#minimum 10 read counts to accept a var as real
##########################
dat <- read.csv(paste0(base_dir,'tile', tile ,'_all_reps_minvarcount10',corr,'/tsv/tile',tile ,'_all_reps_exp/main_synonymous_scores.tsv'),
sep = '\t', header = F,
stringsAsFactors = F, skip = 2,
col.names = c('var', 'SE', 'epsilon', 'score'))
#find out max and min scores
print('minimum 10 DNA var counts')
mid_point <- 'wt'
#run first without args to get the min and max scores
make_heatmap(dat = dat, start_pos = start_pos, tile_end = tile_end, pos_WT = pos_WT, aa_WT = aa_WT, order_aa = order_aa)
#now fill in min_score and max_score. I often put the same one for all tiles so they are on the same scale so that means
#it needs to run once for each tile and then use the min of all min_scores and the max of all max_scores for making the
#plots to be saved as files. For the full DHFR MAVE I chose -3.5 and +4
p1 <- make_heatmap(dat = dat, start_pos = start_pos, tile_end = tile_end, pos_WT = pos_WT, aa_WT = aa_WT, order_aa = order_aa
, min_score = -3.5, max_score = 4.0, mid_point = mid_point
, descr = paste0("MAVE scores tile ", tile, ", ", corr_descr, ", mid point: ", mid_point,
"\n0 mismatch allowed, min base qual 1, average read qual 1, ",
"only DNA vars with minimum 10 read counts"))
p1
png(filename = paste0(base_dir,'HZ_plots/heatmaps/heatmap_tile',tile,'.minvarcount10', corr,'.midpoint',mid_point,'.png'),
width = 1400, height = 800)
print(p1)
dev.off()
#run first without args to get the max error
make_error_heatmap(dat = dat, start_pos = start_pos, tile_end = tile_end, pos_WT = pos_WT, aa_WT = aa_WT, order_aa = order_aa)
#I used 1.0 as max error for all tiles after looking at the different max for the different tiles
max_err = 1.0
p2 <- make_error_heatmap(dat = dat, start_pos = start_pos, tile_end = tile_end, pos_WT = pos_WT, aa_WT = aa_WT, order_aa = order_aa
, max_err = max_err
, descr = paste0("MAVE score errors tile ", tile, ", ", corr_descr,
"\n0 mismatch allowed, min base qual 1, average read qual 1, ",
"only DNA vars with minimum 10 read counts"))
p2
png(filename = paste0(base_dir,'HZ_plots/heatmaps/heatmap_tile',tile,'.err.minvarcount10', corr,'.max_err',max_err,'.png'),
width = 1400, height = 800)
print(p2)
dev.off()
}