Skip to content

Commit

Permalink
merge deconvolution improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
Schumann committed May 5, 2022
2 parents c56146a + 342e340 commit d692764
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 52 deletions.
1 change: 1 addition & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ dist_pkglibexec_scripts_SCRIPTS = \
scripts/deconvolution.R \
scripts/download_databases.sh \
scripts/generateNavigation.R \
scripts/renderReport.R \
scripts/get_qc_table.py \
scripts/overview_QC_table.R \
scripts/parse_vep.py \
Expand Down
2 changes: 1 addition & 1 deletion configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ find_or_override_prog([KRAKEN2], [kraken2])
find_or_override_prog([KRAKEN2_BUILD], [kraken2-build])
find_or_override_prog([IMPORT_KRONA], [ktImportKrona])
find_or_override_prog([IMPORT_TAXONOMY], [ktImportTaxonomy])
find_or_override_prog([IVAR], [ivar])
find_or_override_prog([IVAR], [ivar trim])
find_or_override_prog([LOFREQ], [lofreq])
find_or_override_prog([VEP], [vep])
find_or_override_prog([MULTIQC], [multiqc])
Expand Down
2 changes: 1 addition & 1 deletion etc/settings.yaml.in
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ tools:
args: ""
ivar:
executable: @IVAR@
args: ""
args: "trim -q 15 -m 180 -s 4 -e"
lofreq:
executable: @LOFREQ@
args: ""
Expand Down
38 changes: 21 additions & 17 deletions scripts/deconvolution.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ createSigMatrix <- function ( mutations.vector, mutation_sheet ) {
mutations.df <- mutations.df[,-(which(names(mutations.df) %in% "source"))]
}
# create an empty data frame add a column for the Wildtype
# Wildtype in this case means the reference version of SARS-Cov-2
# "Others" means that the particular mutation is not found and the mutation site could be mutated otherwise or not at all

msig <- setNames( data.frame( matrix( ncol = ncol(mutations.df)+1, nrow = 0 )), c("WT", colnames(mutations.df)))
msig <- setNames( data.frame( matrix( ncol = ncol(mutations.df)+1, nrow = 0 )), c("Others", colnames(mutations.df)))
msig <- bind_rows(tibble(muts=mutations.vector), msig)
# making a matrix with the signature mutations found in the sample
# make binary matrix matching the mutations to the mutation-list per variant to see how many characterising mutations
Expand All @@ -27,26 +27,30 @@ createSigMatrix <- function ( mutations.vector, mutation_sheet ) {
return( msig[,-match('muts', names(msig))]*1 ) # use the *1 to turn true/false to 0/1
}

simulateWT <- function ( mutations.vector, bulk_freq.vector, simple_sigmat.dataframe, coverage.vector) {
# for the deconvolution to work we need the "wild type" frequencies too. The matrix from above got mirrored,
# wild type mutations are simulated the following: e.g. T210I (mutation) -> T210T ("wild type")
simulateOthers <- function ( mutations.vector, bulk_freq.vector, simple_sigmat.dataframe, coverage.vector, Others_weight) {
#' for the deconvolution to work we need the "wild type" frequencies too. The matrix from above got mirrored,
#' wild type mutations are simulated the following: e.g. T210I (mutation) -> T210T ("wild type")

# 1. make "WT mutations"
muts_wt <- lapply(mutations.vector,function(x) str_replace(x,regex(".$"),
# 1. make "Others mutations"
muts_Others <- lapply(mutations.vector,function(x) str_replace(x,regex(".$"),
str_sub(str_split(x,":")[[1]][2], 1,1)))
muts_wt.df <- data.frame(muts = unlist(muts_wt))
muts_Others.df <- data.frame(muts = unlist(muts_Others))
# 2. make frequency values, subtract the observed freqs for the real mutations from 1
bulk_wt <- lapply(bulk_freq.vector, function (x) {1-x})
bulk_Others <- lapply(bulk_freq.vector, function (x) {1-x})

# 3. make matrix with wt mutations and inverse the values and wild type freqs
msig_inverse <- bind_cols(muts_wt.df, as.data.frame(+(!simple_sigmat.dataframe)))
# 3. make matrix with Others mutations and inverse the values and wild type freqs
msig_inverse <- bind_cols(muts_Others.df, as.data.frame(+(!simple_sigmat.dataframe)))

# 4. apply Others weight
# fixme: it could be this can be implemented in the step above already
msig_inverse[ msig_inverse == 1] <- 1/Others_weight

# fixme: not sure if this really is a nice way to concat those things...
# no it's not you could use dplyr and mutate
muts_all <- c(muts_wt,mutations.vector)
muts_all <- c(muts_Others,mutations.vector)
muts_all.df <- data.frame(muts = unlist(muts_all))

bulk_all <- c(bulk_wt, bulk_freq.vector)
bulk_all <- c(bulk_Others, bulk_freq.vector)
bulk_all.df <- data.frame(freq = unlist(bulk_all))

coverage_all <- c(coverage.vector,coverage.vector)
Expand Down Expand Up @@ -94,10 +98,10 @@ dedupeVariants <- function (variant, variants.df, dedup_variants.df) {
for ( row in dedup_variants.df$variants ){
if ( grepl( row,groupName_variants )) {

# if variants are getting pooled with WT they are just WT and nothing else and also not "others"
if (str_detect(groupName_variants, "WT")){
rownames ( dedup_variants.df )[rownames(dedup_variants.df) == row] <- "WT"
variants_to_drop <- duped_variants[!grepl("WT",duped_variants)]
# if variants are getting pooled with Others they are just Others and nothing else
if (str_detect(groupName_variants, "Others")){
rownames ( dedup_variants.df )[rownames(dedup_variants.df) == row] <- "Others"
variants_to_drop <- duped_variants[!grepl("Others",duped_variants)]
} else{
rownames ( dedup_variants.df )[rownames(dedup_variants.df) == row] <- groupName_variants
variants_to_drop <- NA
Expand Down
59 changes: 26 additions & 33 deletions scripts/report_scripts/variantreport_p_sample.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,8 @@ get_protein_mut <- function (vepfile){ # parse together from vep output "Protein
#' this function is for parsing the information about mutation position in the amino acid sequence, the reference aa and
#' the alternative mutation into the aa-mutation-notation which is later on comparable to the lists of signature mutations
# reading in whole vep txt output
# reading in whole vep txt output
# TODO: include check about correct VEP file input format
vepfile.df <- read.table(vepfile, sep = ',', header = T) # you should include in the vep script to parse out the #
#in the beginning of the line or include that step here.
# parsing snv and protmut location
Expand Down Expand Up @@ -251,11 +252,14 @@ vep.info <- variant_protein_mut
complete.df <- dplyr::left_join(lofreq.info, vep.info, by = c('gene_mut'='gene_mut'), copy = T) %>%
rowwise() %>% mutate(gene_mut_collapsed = paste(genes,gene_mut,sep = ":"))
# TODO: let the read coverage filter be set dynamically over the setting file
complete_cov_filtered.df <- complete.df %>% filter(!(as.numeric(cov) < 100))
# filter for mutations which are signature mutations
match.df <- complete.df %>%
match.df <- complete_cov_filtered.df %>%
filter(!is.na(name))
# filter for everything that is not a signature mutation
nomatch.df <- complete.df %>%
nomatch.df <- complete_cov_filtered.df %>%
filter(is.na(name))
```

Expand Down Expand Up @@ -354,17 +358,19 @@ dropped_variants <- unique(dropped_variants)
dropped_variants <- dropped_variants[!is.na(dropped_variants)]
```
```{r calculate_sigmat_weigths, message=F, warning=F, include = FALSE, eval = executeDeconvolution}
deconv_lineages <- colnames(msig_simple_unique[,-which( names(msig_simple_unique) %in% c("muts", "WT") )])
deconv_lineages <- colnames(msig_simple_unique[,-which( names(msig_simple_unique) %in% "muts")])
# create list of proportion values that will be used as weigths
sigmut_proportion_weights <- list()
for (lineage in deconv_lineages){
if ( grepl(",", lineage)){
if ( lineage == "Others"){
# !! 17/02/2022 It's not yet tested how robust this behaves when one would mindlessly clutter the mutationsheet
# with lineages that are very unlikely to detect or not detected
value <- nrow(msig_simple_unique) / nrow(sigmuts_deduped)
} else if ( grepl(",", lineage)){
group <- unlist(str_split(lineage, ","))
avrg <- sum(sig_mutations.df$name %in% group) / length(group)
value <- sum(msig_simple_unique[lineage]) / avrg
} else{
} else {
value <- sum(msig_simple_unique[lineage]) / sum(sig_mutations.df$name == lineage)
}
sigmut_proportion_weights[lineage] <- value
Expand All @@ -375,19 +381,19 @@ sigmut_proportion_weights <- as_tibble(sigmut_proportion_weights)
# fixme: there should be a way to do this vectorized
msig_simple_unique_weighted <- msig_simple_unique
for (lineage in deconv_lineages){
msig_simple_unique_weighted[lineage] <- msig_simple_unique_weighted[lineage] / as.numeric(sigmut_proportion_weights[lineage])
weight <- msig_simple_unique_weighted[lineage] / as.numeric(sigmut_proportion_weights[lineage])
msig_simple_unique_weighted[lineage] <- as.numeric( ifelse( is.na(weight),0, unlist(weight) ))
}
```
```{r simulating_WT_mutations, message=F, warning=F, include = FALSE, eval = executeDeconvolution}
```{r simulating_Others_mutations, message=F, warning=F, include = FALSE, eval = executeDeconvolution}
# construct additional WT mutations that are not weighted
msig_stable_all <- simulateWT( mutations.vector, bulk_freq.vector,
Others_weight <- as.numeric(sigmut_proportion_weights["Others"])
msig_stable_all <- simulateOthers( mutations.vector, bulk_freq.vector,
msig_simple_unique_weighted[,-which(names(msig_simple_unique_weighted) == "muts")],
match.df$cov)
match.df$cov,
Others_weight)
msig_stable_unique <- msig_stable_all[[1]]
```

```{r deconvolution, message=F, warning=F, include = FALSE, eval = executeDeconvolution}
Expand All @@ -407,27 +413,14 @@ variant_abundance <- deconv(bulk_all,sig)
"}`

```{r plot, message=F, warning=F, echo = FALSE, eval = executeDeconvolution}
Variant_colors <- c("#8a3874", "#4d3da1", "#62c742", "#13317a", "#86e5a3", "#6f69e5", "#4cdab2", "#999ce4", "#2ce6f3",
"#6638c6", "#005f29", "#5d73cc", "#616524", "#a8a041", "#e9f600", "#3eaca8", "#62a273", "#4ea5d0", "#655c95", "#67a04e")
# TODO this shoul be put into a seperate color palette function in a seperate file
# work in progress...only to show how it theoretically can look like in the report
variants <- colnames(msig_stable_unique[,-1])
df <- data.frame(rbind(variant_abundance))
# XXX: Replace the long column name consisting of concatenated variant
# names with "others" and put the variant names in a label.
# TODO: This should be done much earlier when building the data frame.
condensed_variants_names <-
unlist( lapply(variants, function (x) { if (str_detect(x, ".*,.*,.*")) { "others" } else { x }}) )
colnames(df) <- condensed_variants_names
variants_labels <-
unlist( lapply(variants, function (x) str_replace_all(x, ",", "\n")) )
colnames(df) <- variants
df <- df %>%
tidyr::pivot_longer( everything() )
Expand All @@ -442,9 +435,9 @@ if ( round( sum(df$value),1) == 1 ){
# case 2: in case "others" == 0, both variants can be split up again and being given the value 0 OR
# case 3: in case multiple vars can really not be distinguished from each other they will be distributed normaly
if ( any(str_detect(variants, ",")) ) {
grouped_rows <- which(str_detect(variants, ","))
for ( row in grouped_rows ){
while ( any(str_detect(df$name, ",")) ) {
grouped_rows <- which(str_detect(df$name, ","))
for ( row in grouped_rows ){ # fixme: this loop might be unneccessary, since only the first row should been picked, everything else will be handled by the while loop
if ( df[row,"value"] == 0 ){
grouped_variants <- unlist( str_split(df[row,"name"], ",") )
for ( variant in grouped_variants ){
Expand Down

0 comments on commit d692764

Please sign in to comment.