deduplicate input sequences
Chris Warth committed Nov 30, 2015
1 parent 67186ea commit 8f67cf2
Showing 5 changed files with 131 additions and 26 deletions.
21 changes: 3 additions & 18 deletions R/pfitter.r
@@ -287,24 +287,9 @@ add.consensus <- function ( in.fasta, label='sample' ) {
# d <- prep.distances(seq)
# r <- pfitter(d$distances, 2.16e-05, d$seq.length)

prep.distances <- function ( in.fasta, include.gaps.in.Hamming=FALSE ) {
stopifnot(inherits(in.fasta, 'DNAbin'))

# Add the consensus.
.consensus.mat <- matrix( seqinr::consensus( as.character( in.fasta ) ), nrow = 1 );
consensus <- as.DNAbin( .consensus.mat );
rownames( consensus ) <- paste( "CONSENSUS" );

fasta.with.consensus <- rbind( consensus, in.fasta );

# Remove any columns with a consensus that is a gap, which means
# that over half of seqs have gaps. This needs to be removed
# because it becomes not sensible to consider poisson rates of
# insertions. We do however consider rates of deletions, treating
# them as point mutations (by including the indel counts in the
# Hamming distance calculation).
fasta.with.consensus <- fasta.with.consensus[ , .consensus.mat[ 1, ] != "-" ];

prep.distances <- function ( fasta.with.consensus, include.gaps.in.Hamming=FALSE ) {
stopifnot(inherits(fasta.with.consensus, 'DNAbin'))

# The pairwise.deletion = TRUE argument is necessary so that columns with any gaps are not removed.
# The optional second call adds in the count of sites with gap differences
# (gap in one sequence but not the other), which are not included
83 changes: 83 additions & 0 deletions bin/dedup.py
@@ -0,0 +1,83 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Deduplicate sequences in a FASTA file.
Identical sequences are collapsed into a single record, and the number of times
each sequence was observed is appended to the record id as a "_<count>" suffix.
For example, a name like
>RV217_PDB|1M|01WG|NFLG|2011/11/10
becomes
>RV217_PDB|1M|01WG|NFLG|2011/11/10_3
if that sequence occurs three times.
Deduplicated records are written to stdout in FASTA format.
usage:
dedup.py sequences.fasta >deduped.fasta
'''

from __future__ import print_function

import sys
import os.path
import argparse
from Bio import SeqIO


def deduplicate(datafile):
    '''
    Read sequences from a FASTA file (datafile) and yield one record per unique
    sequence, with the number of duplicate copies appended to the record id.
    '''
    seqs = dict()
    counts = dict()
    with open(datafile, "rU") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            h = hash(str(record.seq))
            if h not in seqs:
                seqs[h] = record
                counts[h] = 1
            else:
                counts[h] += 1
    for k, r in seqs.items():
        r.id = "{}_{}".format(r.id, counts[k])
        r.description = ""
        yield r


def build_parser():
    """
    Build the command-line argument parser.
    """
    def existing_file(fname):
        """
        Argparse type for an existing file
        """
        if not os.path.isfile(fname):
            raise ValueError("Invalid file: " + str(fname))
        return fname

    parser = argparse.ArgumentParser(description=__doc__)

    parser.add_argument('datafiles', nargs='+', help='FASTA input', type=existing_file)

    return parser


def main(args=sys.argv[1:]):

    parser = build_parser()
    a = parser.parse_args(args)

    for datafile in a.datafiles:
        SeqIO.write(deduplicate(datafile), sys.stdout, "fasta")


if __name__ == "__main__":
    main(sys.argv[1:])
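
For reference, a minimal Python 3 sketch (not part of this commit) of the id scheme dedup.py produces, assuming Biopython is installed; the three-record FASTA below is made up. Identical sequences collapse into one record and the duplicate count is appended to the id.

# Hypothetical demo of dedup.py's id scheme; assumes Biopython and Python 3.
from io import StringIO
from Bio import SeqIO

fasta = ">s1\nACGT\n>s2\nACGT\n>s3\nACTT\n"
seqs, counts = {}, {}
for record in SeqIO.parse(StringIO(fasta), "fasta"):
    h = hash(str(record.seq))
    if h not in seqs:
        seqs[h] = record
        counts[h] = 1
    else:
        counts[h] += 1
for k, r in seqs.items():
    print("{}_{}".format(r.id, counts[k]))   # prints s1_2 and s3_1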


11 changes: 6 additions & 5 deletions bin/infer.sh
@@ -80,17 +80,18 @@ for sample in "${files[@]}"
do
label=$(basename $sample)
label=${label%.*}
echocmd "dedup.py ${sample} >${outdir}/${label}.fa"

echotty "Creating alignment with PRANK..."
echocmd "prank -d=${sample} -o=${outdir}/prank -quiet -once -f=fasta -showanc -showtree -showevents -DNA >${outdir}/prankcmd.log 2>&1"
echocmd "prank -d=${outdir}/${label}.fa -o=${outdir}/prank -quiet -once -f=fasta -showanc -showtree -showevents -DNA >${outdir}/prankcmd.log 2>&1"

echotty "Testing for multiple founders..."
moi=$(keele ${sample})
moi=$(keele ${outdir}/${label}.fa)
if [[ "${moi}" == *"multiple infection"* ]]
then
# multiple founders
# split the tree just below the root and process each subtree's sequences separately.
echocmd "treesplit.py -o ${outdir} ${outdir}/prank.best.dnd ${sample}"
echocmd "treesplit.py -o ${outdir} ${outdir}/prank.best.dnd ${outdir}/${label}.fa"
echocmd "prank -d=${outdir}/left.fasta -o=${outdir}/left -quiet -once -f=fasta -showanc -showtree -showevents -DNA >${outdir}/prankcmd.log 2>&1"
echocmd "prank -d=${outdir}/right.fasta -o=${outdir}/right -quiet -once -f=fasta -showanc -showtree -showevents -DNA >${outdir}/prankcmd.log 2>&1"

@@ -125,15 +126,15 @@ do
echocmd "beast -working -overwrite -beagle ${outdir}/beast_in.xml >${outdir}/beastcmd.log 2>&1"

echotty 'Extracting estimated time of infection'
echocmd 'posterior_toi.py ${outdir}/beastout.log $sample'
echocmd 'posterior_toi.py ${outdir}/beastout.log ${outdir}/${label}.fa'

# echotty "Exracting guide tree..."
# echocmd "treeannotator ${outdir}/beastout.trees > ${outdir}/mcc.tree 2>${outdir}/treeannotator.log"

# tr "/" "-" < ${outdir}/mcc.tree | nexus2newick.py | tr "-" "/" | tr -d "'" > ${outdir}/guidetree.tree

# echotty "Inferring ancestral states with PRANK..."
# echocmd "prank -d=${sample} -t=${outdir}/guidetree.tree -o=${outdir}/prank -quiet -once -f=fasta -showanc -showtree -showevents -codon >${outdir}/prankcmd.log 2>&1"
# echocmd "prank -d=${outdir}/${label}.fa -t=${outdir}/guidetree.tree -o=${outdir}/prank -quiet -once -f=fasta -showanc -showtree -showevents -codon >${outdir}/prankcmd.log 2>&1"

done

39 changes: 36 additions & 3 deletions bin/keele
@@ -7,7 +7,7 @@
# a 1 exit code for multiple infections.

if (!suppressMessages(require("pacman"))) install.packages("pacman")
pacman::p_load(optparse)
pacman::p_load(optparse, tidyr, dplyr)

search.source <- function(file, path=c('.', 'R'), ...)
{
@@ -23,13 +23,44 @@ select.early <- function(sequences) {
date <- rownames(sequences) %>%
strsplit('|', fixed=TRUE) %>%
rapply(function(x) tail(x, 1)) %>%
(function(l) sub("_\\d+$", "", l)) %>%
as.Date()

early <- date %>% unique() %>% sort() %>% head(1)
sequences[date==early,]

}

# add a consensus sequence.
# Useful for formatting a fasta file in a form acceptable
# to the PFitter tool at http://www.hiv.lanl.gov/content/sequence/POISSON_FITTER/pfitter.html
#
# Typical usage:
# read.dna( fasta.file, format = "fasta" ) %>%
# add.consensus( label=basename(file_path_sans_ext(fasta.file))) %>%
# write.dna('output2.fa', format = "fasta", nbcol=-1, colsep = "",indent = 0,blocksep = 0)
#
add.consensus <- function ( in.fasta, label='Consensus' ) {
.consensus.mat <- matrix( seqinr::consensus( as.character( in.fasta ) ), nrow = 1 );
consensus <- as.DNAbin( .consensus.mat );
rownames( consensus ) <- label

rbind( consensus, in.fasta );
}

filter.gaps <- function(in.fasta, consensus='Consensus') {
# Remove any column whose consensus is a gap, which means that over half
# of the sequences have a gap there. These columns must be removed because
# it is not sensible to model Poisson rates of insertions. We do, however,
# consider rates of deletions, treating them as point mutations (by
# including the indel counts in the Hamming distance calculation).
in.fasta <- in.fasta[, as.character(in.fasta)[1, ] != '-']

return(in.fasta)
}


search.source('utils.r')
search.source('pfitter.r')

@@ -40,7 +71,7 @@ parser <- OptionParser(usage='keele [options] <fastafile>') %>%
help="Include site with gaps in distace measure [default %default]. This effectively adds in a new distance matrix calculated with 'indel' model in dist.dna().",
dest="include.gaps")

# a <- parse_args(parser, args = c('rv217_anon_1w_44.fasta'), positional_arguments = TRUE)
# a <- parse_args(parser, args = c('sample_aln.fa'), positional_arguments = TRUE)

a <- parse_args(parser, positional_arguments = TRUE)
file <- a$args
@@ -57,7 +88,9 @@ if (file.access(file) == -1)

d <- read.dna( file, format = "fasta" ) %>%
select.early() %>%
prep.distances(include.gaps.in.Hamming=opt$include.gaps )
add.consensus() %>%
filter.gaps() %>%
prep.distances(include.gaps.in.Hamming=opt$include.gaps)

r <- pfitter(d$distances, opt$rate, d$seq.length)

3 changes: 3 additions & 0 deletions bin/mkbeast_rv217.py
@@ -112,6 +112,9 @@ def processFasta(datafile):
patientId = fields[0]
timePoint = fields[1] if len(fields) > 1 else "0"
sampleDate = fields[4] if len(fields) > 4 else "0"
# if the sequence name ends with "_<digits>", assume this is a multiplicity indicator
# and strip it off before converting the date.
sampleDate = re.sub(r"_\d+$", '', sampleDate)
taxon = record

collectiondate = patient[sampleDate]
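
A standalone sketch (not from the repository) of how this suffix-stripping step behaves, assuming sample-date fields like "2011/11/10" that may carry the "_<count>" suffix added by dedup.py:

# Hypothetical demo of stripping a trailing "_<count>" multiplicity suffix.
import re

for field in ("2011/11/10_12", "2011/11/10"):
    print(re.sub(r"_\d+$", "", field))   # both iterations print 2011/11/10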
