diff --git a/R/gabowmyers.R b/R/gabowmyers.R index eeff2e1..cb96d16 100644 --- a/R/gabowmyers.R +++ b/R/gabowmyers.R @@ -211,7 +211,7 @@ prepareGraphForGabowMyers <- function(w, chains, input_data) { #' @import tidyr #' @param w matrix of CCF values (rows = clusters, columns = samples) #' @return graph_G tibble of possible edges with columns edge, parent, child -prepareGraph <- function(w_mat) { +prepareGraph <- function(w_mat, thresh) { graph_pre <- data.frame(edge = character(), parent = character(), child = character()) for (i in seq_len(nrow(w_mat))) { graph_pre <- graph_pre %>% add_row(edge = paste("root->", i, sep = ""), parent = "root", child = as.character(i)) @@ -219,7 +219,7 @@ prepareGraph <- function(w_mat) { if (i!=j) { i_row = w_mat[i, ] j_row = w_mat[j, ] - if (all(j_row[i_row > 0] > 0)) { + if (all(j_row-i_row > -thresh)) { graph_pre <- graph_pre %>% add_row(edge = paste(j, "->", i, sep = ""), parent = as.character(j), child = as.character(i)) } } @@ -261,7 +261,7 @@ enumerateSpanningTrees <- function(graph_G, w, sum_filter_thresh=0.2) { generateAllTrees <- function(w, lineage_precedence_thresh=0.1, sum_filter_thresh=0.2) { w_mat <- estimateCCFs(w) w_mat <- assign("w_mat", w_mat, envir = .GlobalEnv) - graph_G_pre <- prepareGraph(w_mat) + graph_G_pre <- prepareGraph(w_mat, lineage_precedence_thresh) graph_G <- filterEdgesBasedOnCCFs(graph_G_pre, w_mat, thresh = lineage_precedence_thresh) graph_G <- assign("graph_G", graph_G, envir = .GlobalEnv) enumerateSpanningTreesModified(graph_G, w_mat, sum_filter_thresh = sum_filter_thresh) diff --git a/R/preprocessing.R b/R/preprocessing.R index af5b733..27b2419 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -2,7 +2,7 @@ #' #' @export #' @param input_file input data file; -importCSV <- function(inputFile) { +importCSV <- function(inputFile, alt_reads_thresh = 6, vaf_thresh = 0.02) { data <- read_csv(inputFile, show_col_types = FALSE) output_data <- list() @@ -36,7 +36,7 @@ importCSV <- function(inputFile) { output_data$n[output_data$n==0] <- round(mean(output_data$n)) } - output_data$tcn <- as.matrix(data[c("mutation", "sample", "tumor_integer_copy_number")] %>% pivot_wider(names_from = sample, values_from = tumor_integer_copy_number, values_fill = 0)) + output_data$tcn <- as.matrix(data[c("mutation", "sample", "tumor_integer_copy_number")] %>% pivot_wider(names_from = sample, values_from = tumor_integer_copy_number, values_fill = 2)) rownames(output_data$tcn) <- output_data$tcn[,'mutation'] output_data$tcn <- output_data$tcn[,-1, drop=FALSE] rowname = rownames(output_data$tcn) @@ -45,7 +45,7 @@ importCSV <- function(inputFile) { rownames(output_data$tcn) = rowname colnames(output_data$tcn) = colname - output_data$purity <- as.matrix(data[c("mutation", "sample", "purity")] %>% pivot_wider(names_from = sample, values_from = purity)) + output_data$purity <- as.matrix(data[c("mutation", "sample", "purity")] %>% pivot_wider(names_from = sample, values_from = purity, values_fill = 0)) rownames(output_data$purity) <- output_data$purity[,'mutation'] output_data$purity <- output_data$purity[,-1, drop=FALSE] rowname = rownames(output_data$purity) @@ -53,21 +53,22 @@ importCSV <- function(inputFile) { output_data$purity <- matrix(as.numeric(output_data$purity), ncol = ncol(output_data$purity)) rownames(output_data$purity) = rowname colnames(output_data$purity) = colname - if (ncol(output_data$purity) == 1) { - if (length(unique(output_data$purity[,])) != 1) { - warning("purity not consistent for the same sample; taking the mean purity") - output_data$purity <- round(colMeans(output_data$purity), digit=2) - } else { - output_data$purity <- unique(output_data$purity[,]) - } - } else { - if (nrow(unique(output_data$purity[,])) != 1) { - warning("Purity not consistent for the same sample; taking the mean purity") - output_data$purity <- round(colMeans(output_data$purity), digit=2) - } else { - output_data$purity <- unique(output_data$purity[,])[1,] - } - } + output_data$purity = colSums(output_data$purity) / colSums(!!output_data$purity) + # if (ncol(output_data$purity) == 1) { + # if (length(unique(output_data$purity[,])) != 1) { + # warning("purity not consistent for the same sample; taking the mean purity") + # output_data$purity <- round(colMeans(output_data$purity), digit=2) + # } else { + # output_data$purity <- unique(output_data$purity[,]) + # } + # } else { + # if (nrow(unique(output_data$purity[,])) != 1) { + # warning("Purity not consistent for the same sample; taking the mean purity") + # output_data$purity <- round(colMeans(output_data$purity), digit=2) + # } else { + # output_data$purity <- unique(output_data$purity[,])[1,] + # } + # } output_data$S = ncol(output_data$y) output_data$I = nrow(output_data$y) @@ -88,6 +89,8 @@ importCSV <- function(inputFile) { output_data$m <- estimateMultiplicityMatrix(output_data) } + output_data$y[output_data$y / output_data$n < vaf_thresh] = 0 + output_data$y[output_data$y < alt_reads_thresh] = 0 return(output_data) } diff --git a/vignettes/pictograph.Rmd b/vignettes/pictograph.Rmd index 006749c..34bbe5d 100644 --- a/vignettes/pictograph.Rmd +++ b/vignettes/pictograph.Rmd @@ -31,10 +31,11 @@ The required user input is a csv file that contains at least columns named "samp head(read.csv(system.file('extdata/example_input.csv', package = 'pictograph'))) ``` -The input file can be read using the importCSV function. Alt read count (y), total read count (n), tumor copy number (tcn), and multiplicity (m) are stored in matrices where the columns are samples, and rows are variants. Purity is supplied as a vector. I and S are integers representing the number of variants and number of samples, respectively. +The input file can be read using the importCSV function. Alt read count (y), total read count (n), tumor copy number (tcn), and multiplicity (m) are stored in matrices where the columns are samples, and rows are variants. Purity is supplied as a vector. I and S are integers representing the number of variants and number of samples, respectively. The function will also pre-process the data to remove potential false positive read counts. By default, a mutation with alt reads less than 6 or a VAF less than 2% is considered a false positive. Users can change the thresholds using the `alt_reads_thresh` and `vaf_thresh` parameters. ```{r input_data} -input_data <- importCSV(system.file('extdata/example_input.csv', package = 'pictograph')) +input_data <- importCSV(system.file('extdata/example_input.csv', package = 'pictograph'), + alt_reads_thresh = 6, vaf_thresh = 0.02) ``` ```{r load_results, include = F}