deduplicate input sequences
Chris Warth committed Nov 30, 2015
1 parent 67186ea commit 8f67cf2
Showing 5 changed files with 131 additions and 26 deletions.
21 changes: 3 additions & 18 deletions R/pfitter.r
@@ -287,24 +287,9 @@ add.consensus <- function ( in.fasta, label='sample' ) {
# d <- prep.distances(seq)
# r <- pfitter(d$distances, 2.16e-05, d$seq.length)

prep.distances <- function ( in.fasta, include.gaps.in.Hamming=FALSE ) {
stopifnot(inherits(in.fasta, 'DNAbin'))

# Add the consensus.
.consensus.mat <- matrix( seqinr::consensus( as.character( in.fasta ) ), nrow = 1 );
consensus <- as.DNAbin( .consensus.mat );
rownames( consensus ) <- paste( "CONSENSUS" );

fasta.with.consensus <- rbind( consensus, in.fasta );

# Remove any columns with a consensus that is a gap, which means
# that over half of seqs have gaps. This needs to be removed
# because it becomes not sensible to consider poisson rates of
# insertions. We do however consider rates of deletions, treating
# them as point mutations (by including the indel counts in the
# Hamming distance calculation).
fasta.with.consensus <- fasta.with.consensus[ , .consensus.mat[ 1, ] != "-" ];

prep.distances <- function ( fasta.with.consensus, include.gaps.in.Hamming=FALSE ) {
stopifnot(inherits(fasta.with.consensus, 'DNAbin'))

# The pairwise.deletion = TRUE argument is necessary so that columns with any gaps are not removed.
# The optional second call adds in the count of sites with gap differences
# (gap in one sequence but not the other), which are not included
83 changes: 83 additions & 0 deletions bin/dedup.py
@@ -0,0 +1,83 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Deduplicate sequences in a FASTA file.
Identical sequences are collapsed into a single record, and the number of times
each sequence was observed is appended to the record id as a "_<count>" suffix.
For example, a name like
>RV217_PDB|1M|01WG|NFLG|2011/11/10
becomes
>RV217_PDB|1M|01WG|NFLG|2011/11/10_3
if that sequence occurs three times.
Deduplicated records are written to stdout in FASTA format.
usage:
dedup.py sequences.fasta >deduped.fasta
'''

from __future__ import print_function

import sys
import os.path
import argparse
from Bio import SeqIO


def deduplicate(datafile):
    '''
    Read sequences from a FASTA file (datafile) and yield one record per unique
    sequence, with the number of duplicate copies appended to the record id.
    '''
    seqs = dict()
    counts = dict()
    with open(datafile, "rU") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            h = hash(str(record.seq))
            if h not in seqs:
                seqs[h] = record
                counts[h] = 1
            else:
                counts[h] += 1
    for k, r in seqs.items():
        r.id = "{}_{}".format(r.id, counts[k])
        r.description = ""
        yield r


def build_parser():
    """
    Build the command-line argument parser.
    """
    def existing_file(fname):
        """
        Argparse type for an existing file
        """
        if not os.path.isfile(fname):
            raise ValueError("Invalid file: " + str(fname))
        return fname

    parser = argparse.ArgumentParser(description=__doc__)

    parser.add_argument('datafiles', nargs='+', help='FASTA input', type=existing_file)

    return parser


def main(args=sys.argv[1:]):

    parser = build_parser()
    a = parser.parse_args(args)

    for datafile in a.datafiles:
        SeqIO.write(deduplicate(datafile), sys.stdout, "fasta")


if __name__ == "__main__":
    main(sys.argv[1:])
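
For reference, a minimal Python 3 sketch (not part of this commit) of the id scheme dedup.py produces, assuming Biopython is installed; the three-record FASTA below is made up. Identical sequences collapse into one record and the duplicate count is appended to the id.

# Hypothetical demo of dedup.py's id scheme; assumes Biopython and Python 3.
from io import StringIO
from Bio import SeqIO

fasta = ">s1\nACGT\n>s2\nACGT\n>s3\nACTT\n"
seqs, counts = {}, {}
for record in SeqIO.parse(StringIO(fasta), "fasta"):
    h = hash(str(record.seq))
    if h not in seqs:
        seqs[h] = record
        counts[h] = 1
    else:
        counts[h] += 1
for k, r in seqs.items():
    print("{}_{}".format(r.id, counts[k]))   # prints s1_2 and s3_1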


11 changes: 6 additions & 5 deletions bin/infer.sh
@@ -80,17 +80,18 @@ for sample in "${files[@]}"
do
label=$(basename $sample)
label=${label%.*}
echocmd "dedup.py ${sample} >${outdir}/${label}.fa"

echotty "Creating alignment with PRANK..."
echocmd "prank -d=${sample} -o=${outdir}/prank -quiet -once -f=fasta -showanc -showtree -showevents -DNA >${outdir}/prankcmd.log 2>&1"
echocmd "prank -d=${outdir}/${label}.fa -o=${outdir}/prank -quiet -once -f=fasta -showanc -showtree -showevents -DNA >${outdir}/prankcmd.log 2>&1"

echotty "Testing for multiple founders..."
moi=$(keele ${sample})
moi=$(keele ${outdir}/${label}.fa)
if [[ "${moi}" == *"multiple infection"* ]]
then
# multiple founders
# split the tree just below the root and process each subtree's sequences separately.
echocmd "treesplit.py -o ${outdir} ${outdir}/prank.best.dnd ${sample}"
echocmd "treesplit.py -o ${outdir} ${outdir}/prank.best.dnd ${outdir}/${label}.fa"
echocmd "prank -d=${outdir}/left.fasta -o=${outdir}/left -quiet -once -f=fasta -showanc -showtree -showevents -DNA >${outdir}/prankcmd.log 2>&1"
echocmd "prank -d=${outdir}/right.fasta -o=${outdir}/right -quiet -once -f=fasta -showanc -showtree -showevents -DNA >${outdir}/prankcmd.log 2>&1"

@@ -125,15 +126,15 @@ do
echocmd "beast -working -overwrite -beagle ${outdir}/beast_in.xml >${outdir}/beastcmd.log 2>&1"

echotty 'Extracting estimated time of infection'
echocmd 'posterior_toi.py ${outdir}/beastout.log $sample'
echocmd 'posterior_toi.py ${outdir}/beastout.log ${outdir}/${label}.fa'

# echotty "Exracting guide tree..."
# echocmd "treeannotator ${outdir}/beastout.trees > ${outdir}/mcc.tree 2>${outdir}/treeannotator.log"

# tr "/" "-" < ${outdir}/mcc.tree | nexus2newick.py | tr "-" "/" | tr -d "'" > ${outdir}/guidetree.tree

# echotty "Inferring ancestral states with PRANK..."
# echocmd "prank -d=${sample} -t=${outdir}/guidetree.tree -o=${outdir}/prank -quiet -once -f=fasta -showanc -showtree -showevents -codon >${outdir}/prankcmd.log 2>&1"
# echocmd "prank -d=${outdir}/${label}.fa -t=${outdir}/guidetree.tree -o=${outdir}/prank -quiet -once -f=fasta -showanc -showtree -showevents -codon >${outdir}/prankcmd.log 2>&1"

done

39 changes: 36 additions & 3 deletions bin/keele
@@ -7,7 +7,7 @@
# a 1 exit code for multiple infections.

if (!suppressMessages(require("pacman"))) install.packages("pacman")
pacman::p_load(optparse)
pacman::p_load(optparse, tidyr, dplyr)

search.source <- function(file, path=c('.', 'R'), ...)
{
@@ -23,13 +23,44 @@ select.early <- function(sequences) {
date <- rownames(sequences) %>%
strsplit('|', fixed=TRUE) %>%
rapply(function(x) tail(x, 1)) %>%
(function(l) sub("_\\d+$", "", l)) %>%
as.Date()

early <- date %>% unique() %>% sort() %>% head(1)
sequences[date==early,]

}

# add a consensus sequence.
# Useful for formatting a fasta file in a form acceptable
# to the PFitter tool at http://www.hiv.lanl.gov/content/sequence/POISSON_FITTER/pfitter.html
#
# Typical usage:
# read.dna( fasta.file, format = "fasta" ) %>%
# add.consensus( label=basename(file_path_sans_ext(fasta.file))) %>%
# write.dna('output2.fa', format = "fasta", nbcol=-1, colsep = "",indent = 0,blocksep = 0)
#
add.consensus <- function ( in.fasta, label='Consensus' ) {
.consensus.mat <- matrix( seqinr::consensus( as.character( in.fasta ) ), nrow = 1 );
consensus <- as.DNAbin( .consensus.mat );
rownames( consensus ) <- label

rbind( consensus, in.fasta );
}

filter.gaps <- function(in.fasta, consensus='Consensus') {
# Remove any column whose consensus is a gap, which means that over half
# of the sequences have a gap there. These columns must be removed because
# it is not sensible to model Poisson rates of insertions. We do, however,
# consider rates of deletions, treating them as point mutations (by
# including the indel counts in the Hamming distance calculation).
in.fasta <- in.fasta[, as.character(in.fasta)[1, ] != '-']

return(in.fasta)
}


search.source('utils.r')
search.source('pfitter.r')

@@ -40,7 +71,7 @@ parser <- OptionParser(usage='keele [options] <fastafile>') %>%
help="Include site with gaps in distace measure [default %default]. This effectively adds in a new distance matrix calculated with 'indel' model in dist.dna().",
dest="include.gaps")

# a <- parse_args(parser, args = c('rv217_anon_1w_44.fasta'), positional_arguments = TRUE)
# a <- parse_args(parser, args = c('sample_aln.fa'), positional_arguments = TRUE)

a <- parse_args(parser, positional_arguments = TRUE)
file <- a$args
@@ -57,7 +88,9 @@ if (file.access(file) == -1)

d <- read.dna( file, format = "fasta" ) %>%
select.early() %>%
prep.distances(include.gaps.in.Hamming=opt$include.gaps )
add.consensus() %>%
filter.gaps() %>%
prep.distances(include.gaps.in.Hamming=opt$include.gaps)

r <- pfitter(d$distances, opt$rate, d$seq.length)

3 changes: 3 additions & 0 deletions bin/mkbeast_rv217.py
@@ -112,6 +112,9 @@ def processFasta(datafile):
patientId = fields[0]
timePoint = fields[1] if len(fields) > 1 else "0"
sampleDate = fields[4] if len(fields) > 4 else "0"
# if the sequence name ends with "_<digits>", assume this is a multiplicity indicator
# and strip it off before converting the date.
sampleDate = re.sub(r"_\d+$", '', sampleDate)
taxon = record

collectiondate = patient[sampleDate]
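
A standalone sketch (not from the repository) of how this suffix-stripping step behaves, assuming sample-date fields like "2011/11/10" that may carry the "_<count>" suffix added by dedup.py:

# Hypothetical demo of stripping a trailing "_<count>" multiplicity suffix.
import re

for field in ("2011/11/10_12", "2011/11/10"):
    print(re.sub(r"_\d+$", "", field))   # both iterations print 2011/11/10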
