merge deconvolution improvements

BIMSBbioinfo · May 5, 2022 · d692764 · d692764
2 parents c56146a + 342e340
commit d692764
Show file tree

Hide file tree

Showing 5 changed files with 50 additions and 52 deletions.
diff --git a/Makefile.am b/Makefile.am
@@ -13,6 +13,7 @@ dist_pkglibexec_scripts_SCRIPTS = \
   scripts/deconvolution.R \
   scripts/download_databases.sh \
   scripts/generateNavigation.R \
+  scripts/renderReport.R \
   scripts/get_qc_table.py \
   scripts/overview_QC_table.R \
   scripts/parse_vep.py \

diff --git a/configure.ac b/configure.ac
@@ -48,7 +48,7 @@ find_or_override_prog([KRAKEN2],         [kraken2])
 find_or_override_prog([KRAKEN2_BUILD],   [kraken2-build])
 find_or_override_prog([IMPORT_KRONA],    [ktImportKrona])
 find_or_override_prog([IMPORT_TAXONOMY], [ktImportTaxonomy])
-find_or_override_prog([IVAR],            [ivar])
+find_or_override_prog([IVAR],            [ivar trim])
 find_or_override_prog([LOFREQ],          [lofreq])
 find_or_override_prog([VEP],             [vep])
 find_or_override_prog([MULTIQC],         [multiqc])

diff --git a/etc/settings.yaml.in b/etc/settings.yaml.in
@@ -68,7 +68,7 @@ tools:
     args: ""
   ivar:
     executable: @IVAR@
-    args: ""
+    args: "trim -q 15 -m 180 -s 4 -e"
   lofreq:
     executable: @LOFREQ@
     args: ""

diff --git a/scripts/deconvolution.R b/scripts/deconvolution.R
@@ -11,9 +11,9 @@ createSigMatrix <- function ( mutations.vector, mutation_sheet ) {
       mutations.df <- mutations.df[,-(which(names(mutations.df) %in% "source"))]
   }
   # create an empty data frame add a column for the Wildtype
-  # Wildtype in this case means the reference version of SARS-Cov-2
+  # "Others" means that the particular mutation is not found and the mutation site could be mutated otherwise or not at all
 
-  msig <- setNames( data.frame( matrix( ncol = ncol(mutations.df)+1, nrow = 0 )), c("WT", colnames(mutations.df)))
+  msig <- setNames( data.frame( matrix( ncol = ncol(mutations.df)+1, nrow = 0 )), c("Others", colnames(mutations.df)))
   msig <- bind_rows(tibble(muts=mutations.vector), msig)
   # making a matrix with the signature mutations found in the sample
   # make binary matrix matching the mutations to the mutation-list per variant to see how many characterising mutations
@@ -27,26 +27,30 @@ createSigMatrix <- function ( mutations.vector, mutation_sheet ) {
   return( msig[,-match('muts', names(msig))]*1 ) # use the *1 to turn true/false to 0/1
 }
 
-simulateWT <- function ( mutations.vector, bulk_freq.vector, simple_sigmat.dataframe, coverage.vector) {
-  # for the deconvolution to work we need the "wild type" frequencies too. The matrix from above got mirrored, 
-  # wild type mutations are simulated the following: e.g. T210I (mutation) -> T210T ("wild type")
+simulateOthers <- function ( mutations.vector, bulk_freq.vector, simple_sigmat.dataframe, coverage.vector, Others_weight) {
+  #' for the deconvolution to work we need the "wild type" frequencies too. The matrix from above got mirrored, 
+  #' wild type mutations are simulated the following: e.g. T210I (mutation) -> T210T ("wild type")
 
-  # 1. make "WT mutations" 
-  muts_wt <- lapply(mutations.vector,function(x) str_replace(x,regex(".$"), 
+  # 1. make "Others mutations" 
+  muts_Others <- lapply(mutations.vector,function(x) str_replace(x,regex(".$"), 
                                                              str_sub(str_split(x,":")[[1]][2], 1,1)))
-  muts_wt.df <- data.frame(muts = unlist(muts_wt))
+  muts_Others.df <- data.frame(muts = unlist(muts_Others))
   # 2. make frequency values, subtract the observed freqs for the real mutations from 1
-  bulk_wt <- lapply(bulk_freq.vector, function (x) {1-x})
+  bulk_Others <- lapply(bulk_freq.vector, function (x) {1-x})
 
-  # 3. make matrix with wt mutations and inverse the values and wild type freqs
-  msig_inverse <- bind_cols(muts_wt.df, as.data.frame(+(!simple_sigmat.dataframe)))
+  # 3. make matrix with Others mutations and inverse the values and wild type freqs
+  msig_inverse <- bind_cols(muts_Others.df, as.data.frame(+(!simple_sigmat.dataframe)))
+
+  # 4. apply Others weight 
+  # fixme: it could be this can be implemented in the step above already
+  msig_inverse[ msig_inverse == 1] <- 1/Others_weight
 
   # fixme: not sure if this really is a nice way to concat those things...
     # no it's not you could use dplyr and mutate
-  muts_all <- c(muts_wt,mutations.vector)
+  muts_all <- c(muts_Others,mutations.vector)
   muts_all.df <- data.frame(muts = unlist(muts_all))
 
-  bulk_all <- c(bulk_wt, bulk_freq.vector)
+  bulk_all <- c(bulk_Others, bulk_freq.vector)
   bulk_all.df <- data.frame(freq = unlist(bulk_all))
 
   coverage_all <- c(coverage.vector,coverage.vector)
@@ -94,10 +98,10 @@ dedupeVariants <- function (variant, variants.df, dedup_variants.df) {
         for ( row in dedup_variants.df$variants ){
           if ( grepl( row,groupName_variants )) {
 
-            # if variants are getting pooled with WT they are just WT and nothing else and also not "others"
-            if (str_detect(groupName_variants, "WT")){
-              rownames ( dedup_variants.df )[rownames(dedup_variants.df) == row] <- "WT"
-              variants_to_drop <- duped_variants[!grepl("WT",duped_variants)]
+            # if variants are getting pooled with Others they are just Others and nothing else
+            if (str_detect(groupName_variants, "Others")){
+              rownames ( dedup_variants.df )[rownames(dedup_variants.df) == row] <- "Others"
+              variants_to_drop <- duped_variants[!grepl("Others",duped_variants)]
             } else{
               rownames ( dedup_variants.df )[rownames(dedup_variants.df) == row] <- groupName_variants
               variants_to_drop <- NA

diff --git a/scripts/report_scripts/variantreport_p_sample.Rmd b/scripts/report_scripts/variantreport_p_sample.Rmd
@@ -134,7 +134,8 @@ get_protein_mut <- function (vepfile){ # parse together from vep output "Protein
   #' this function is for parsing the information about mutation position in the amino acid sequence, the reference aa and
   #' the alternative mutation into the aa-mutation-notation which is later on comparable to the lists of signature mutations
   
-  # reading in whole vep txt output
+  # reading in whole vep txt output 
+    # TODO: include check about correct VEP file input format
     vepfile.df <- read.table(vepfile, sep = ',', header = T) # you should include in the vep script to parse out the # 
       #in the beginning of the line or include that step here. 
      # parsing snv and protmut location
@@ -251,11 +252,14 @@ vep.info <- variant_protein_mut
 complete.df <- dplyr::left_join(lofreq.info, vep.info, by = c('gene_mut'='gene_mut'), copy = T) %>% 
                rowwise() %>% mutate(gene_mut_collapsed = paste(genes,gene_mut,sep = ":"))
 
+# TODO: let the read coverage filter be set dynamically over the setting file 
+complete_cov_filtered.df <- complete.df %>% filter(!(as.numeric(cov) < 100))
+
 # filter for mutations which are signature mutations
-match.df <- complete.df %>%
+match.df <- complete_cov_filtered.df %>%
                     filter(!is.na(name))
 # filter for everything that is not a signature mutation
-nomatch.df <- complete.df %>%
+nomatch.df <- complete_cov_filtered.df %>%
                         filter(is.na(name))
 ```
 
@@ -354,17 +358,19 @@ dropped_variants <- unique(dropped_variants)
 dropped_variants <- dropped_variants[!is.na(dropped_variants)]
 ```
 ```{r calculate_sigmat_weigths, message=F, warning=F, include = FALSE, eval = executeDeconvolution}
-
-deconv_lineages <- colnames(msig_simple_unique[,-which( names(msig_simple_unique) %in% c("muts", "WT") )])
-
+deconv_lineages <- colnames(msig_simple_unique[,-which( names(msig_simple_unique) %in% "muts")])
 # create list of proportion values that will be used as weigths
 sigmut_proportion_weights <- list()
 for (lineage in deconv_lineages){
-    if ( grepl(",", lineage)){
+    if ( lineage == "Others"){
+        # !! 17/02/2022 It's not yet tested how robust this behaves when one would mindlessly clutter the mutationsheet
+        # with lineages that are very unlikely to detect or not detected
+        value <- nrow(msig_simple_unique) / nrow(sigmuts_deduped)
+    } else if ( grepl(",", lineage)){
         group <- unlist(str_split(lineage, ","))
         avrg <- sum(sig_mutations.df$name %in% group) / length(group)
         value <- sum(msig_simple_unique[lineage]) / avrg
-    } else{
+    } else {
         value <- sum(msig_simple_unique[lineage]) / sum(sig_mutations.df$name == lineage)
     }
     sigmut_proportion_weights[lineage] <- value
@@ -375,19 +381,19 @@ sigmut_proportion_weights <- as_tibble(sigmut_proportion_weights)
 # fixme: there should be a way to do this vectorized
 msig_simple_unique_weighted <- msig_simple_unique
 for (lineage in deconv_lineages){
-    msig_simple_unique_weighted[lineage] <- msig_simple_unique_weighted[lineage] / as.numeric(sigmut_proportion_weights[lineage])
+    weight <- msig_simple_unique_weighted[lineage] / as.numeric(sigmut_proportion_weights[lineage])
+    msig_simple_unique_weighted[lineage] <- as.numeric( ifelse( is.na(weight),0, unlist(weight) ))
+    
 }
-
 ```
-```{r simulating_WT_mutations, message=F, warning=F, include = FALSE, eval = executeDeconvolution}
-
+```{r simulating_Others_mutations, message=F, warning=F, include = FALSE, eval = executeDeconvolution}
 # construct additional WT mutations that are not weighted
-
-msig_stable_all <- simulateWT( mutations.vector, bulk_freq.vector, 
+Others_weight <- as.numeric(sigmut_proportion_weights["Others"])
+msig_stable_all <- simulateOthers( mutations.vector, bulk_freq.vector,
                                msig_simple_unique_weighted[,-which(names(msig_simple_unique_weighted) == "muts")],
-                                match.df$cov)
+                                match.df$cov,
+                                Others_weight)
 msig_stable_unique <- msig_stable_all[[1]]
-
 ```
 
 ```{r deconvolution, message=F, warning=F, include = FALSE, eval = executeDeconvolution}
@@ -407,27 +413,14 @@ variant_abundance <- deconv(bulk_all,sig)
 "}` 
 
 ```{r plot, message=F, warning=F, echo = FALSE, eval = executeDeconvolution}
-
 Variant_colors <- c("#8a3874", "#4d3da1", "#62c742", "#13317a", "#86e5a3", "#6f69e5", "#4cdab2", "#999ce4", "#2ce6f3",
 "#6638c6", "#005f29", "#5d73cc", "#616524", "#a8a041", "#e9f600", "#3eaca8", "#62a273", "#4ea5d0", "#655c95", "#67a04e")
 # TODO this shoul be put into a seperate color palette function in a seperate file
 
-# work in progress...only to show how it theoretically can look like in the report
 variants <- colnames(msig_stable_unique[,-1])
 df <- data.frame(rbind(variant_abundance))
 
-# XXX: Replace the long column name consisting of concatenated variant
-# names with "others" and put the variant names in a label.
-# TODO: This should be done much earlier when building the data frame.
-
-condensed_variants_names <-
-    unlist( lapply(variants, function (x) { if (str_detect(x, ".*,.*,.*")) { "others" } else { x }}) )
-
-colnames(df) <- condensed_variants_names
-
-variants_labels <-
-    unlist( lapply(variants, function (x) str_replace_all(x, ",", "\n")) )
-
+colnames(df) <- variants
 df <- df %>% 
     tidyr::pivot_longer( everything() )
 
@@ -442,9 +435,9 @@ if ( round( sum(df$value),1) == 1 ){
 
 # case 2: in case "others" == 0, both variants can be split up again and being given the value 0 OR
 # case 3: in case multiple vars can really not be distinguished from each other they will be distributed normaly
-if ( any(str_detect(variants, ",")) ) {
-    grouped_rows <- which(str_detect(variants, ","))
-    for ( row in grouped_rows ){
+while ( any(str_detect(df$name, ",")) ) {
+    grouped_rows <- which(str_detect(df$name, ","))
+    for ( row in grouped_rows ){ # fixme: this loop might be unneccessary, since only the first row should been picked, everything else will be handled by the while loop
        if ( df[row,"value"] == 0 ){ 
             grouped_variants <- unlist( str_split(df[row,"name"], ",") )
             for ( variant in grouped_variants ){