-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexample_VEPvcf_parser.R
73 lines (63 loc) · 3.76 KB
/
example_VEPvcf_parser.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# This is an example script, meant for importing vcf files that are chunked over different regions of the genome (or separate INDEL and SNP vcfs),
# but NOT multiple vcfs of same region from different samples, as it assumes none of the variants are shared between vcf files.
# Also it's only tested on one-alt-allele-per-line vcfs, not comma-separated alt alleles
rbind_VEP_VCF_files <- function(vcfFiles,
sampleNames = NULL, ...){ # if not given, will use vcf file name stems as the sampleNames
# decide what sample short names should be, based on filenames if not specified in sampleNames
if(!is.null(sampleNames)){
if(length(sampleNames) != length(vcfFiles)){
stop('sampleNames, if specified, must have same number of elements as vcfFiles\n')
}
filesXsamps = data.frame(file=vcfFiles, samp = sampleNames)
} else {
filesXsamps = data.frame(file=vcfFiles)
filesXsamps$samp = vcfFiles %>%
base::basename() %>%
stringr::str_remove('.gz$') %>%
stringr::str_remove('.vcf$')
}
filesXsamps$samp2 <- make.names(filesXsamps$samp, unique = T)
# import vcf files and parse VEP parts of INFO column
vcfs_imp_l <- readMultiVcfs(vcfFiles = vcfFiles,
sampleNames = filesXsamps$samp2,
sampleNameColumn="SAMPLEID", # SAMPLEID is default colname produced by mutSignatures::importVCFfiles (to maintain compatibility)
filterIn = 'PASS', # Remove rows unless PASS in filter field to reduce size of table. Default = no pre-filter
parseAnno = c('VEP','SNPEFF')) # Assuming that the vcf file contains both VEP and SNPEFF output
# It's a good idea to check that all vep colnames are the same, since we imported different files
common_cols <- Reduce(intersect, vcfs_imp_l$vepColnames)
all_cols <- vcfs_imp_l$vepColnames %>% unlist() %>% unique()
cat('\n Shared VEP field names: ',paste0(common_cols, sep=','),'\n')
if(length(all_cols) > length(common_cols)) {
warning('There were some VEP fieldnames not present in all files.')
}
# rbind all the results from different vcf files
cat('\n rbinding vcf tables / matrices \n')
vcfs_imp <- do.call(rbind, vcfs_imp_l$allVcfs)
samps_imp <- do.call(rbind, vcfs_imp_l$allSamps)
info_imp <- do.call(rbind, vcfs_imp_l$allInfo)
# check if order somehow messed up
if(all(vcfs_imp$SAMPLEID == samps_imp$SAMPLEID) & all(rownames(vcfs_imp) == rownames(samps_imp))){
vcfs_imp %<>% cbind(., samps_imp) #<--- only works if patient / column names are compatible and proper names
} else {
warning('Sample order somehow messed up? Not cbinding vcfs with samples.')
}
# rbind all the fmats matrices of sample-specific values
fmats <- list()
for(nm in names(vcfs_imp_l$fmats)){
fmats[[nm]] <- do.call(rbind, vcfs_imp_l$fmats[[nm]])
}
# check rownames
x <- lapply(fmats, rownames) %>% as.data.frame()
if(!all(apply(x, 1, function(x){length(unique(x)) == 1}))){
warning('Scrambled rows in fmats.')
}
cat('\n Finished rbinding \n')
# join VEP / snpEff lists from different files
conCat_veps <- do.call(c, vcfs_imp_l$allVEPmats)
conCat_snpEffs <- do.call(c, vcfs_imp_l$allSnpEffmats)
# Melt VEP / snpEff matrics.
cat('\n Melting list of matrices to long format \n')
long_vep_effects_mat <- melt_VEP(veps = conCat_veps, vcf = vcfs_imp, onlyGenes = T, outCols = c('Allele','Consequence', 'IMPACT','SYMBOL','Gene'))
long_snpEff_effects_mat <- melt_VEP(veps = conCat_snpEffs, vcf = vcfs_imp, onlyGenes = T, outCols = c('Allele', 'Annotation', 'Annotation_Impact','Gene_Name'))
return(list(vcfs_imp = vcfs_imp, fmats = fmats, long_snpEff_effects_mat = long_snpEff_effects_mat, long_vep_effects_mat = long_vep_effects_mat))
}