Skip to content

Commit

Permalink
New sort command
Browse files Browse the repository at this point in the history
  • Loading branch information
pd3 committed Jul 6, 2017
1 parent 0874974 commit 8fe6acd
Show file tree
Hide file tree
Showing 11 changed files with 145 additions and 15 deletions.
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ OBJS = main.o vcfindex.o tabix.o \
vcfcnv.o HMM.o vcfplugin.o consensus.o ploidy.o bin.o hclust.o version.o \
regidx.o smpl_ilist.o csq.o vcfbuf.o \
mpileup.o bam2bcf.o bam2bcf_indel.o bam_sample.o \
vcfsort.o \
ccall.o em.o prob1.o kmin.o # the original samtools calling

prefix = /usr/local
Expand Down Expand Up @@ -202,6 +203,7 @@ vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vc
vcfroh.o: vcfroh.c $(roh_h)
vcfcnv.o: vcfcnv.c $(cnv_h)
vcfsom.o: vcfsom.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h)
vcfsort.o: vcfsort.c $(htslib_vcf_h) $(bcftools_h)
vcfstats.o: vcfstats.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) $(bcftools_h) $(filter_h) $(bin_h)
vcfview.o: vcfview.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h)
reheader.o: reheader.c $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_kseq_h) $(bcftools_h)
Expand Down
2 changes: 2 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
option has been renamed to `-I, --iupac`, in favor of the standard
`-i, --include`.

* New `sort` command.


## Release 1.5 (June 2017)

Expand Down
42 changes: 39 additions & 3 deletions doc/bcftools.1
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
.\" Title: bcftools
.\" Author: [see the "AUTHORS" section]
.\" Generator: DocBook XSL Stylesheets v1.76.1 <http://docbook.sf.net/>
.\" Date: 2017-07-03 15:52 BST
.\" Date: 2017-07-06 16:16 BST
.\" Manual: \ \&
.\" Source: \ \&
.\" Language: English
.\"
.TH "BCFTOOLS" "1" "2017\-07\-03 15:52 BST" "\ \&" "\ \&"
.TH "BCFTOOLS" "1" "2017\-07\-06 16:16 BST" "\ \&" "\ \&"
.\" -----------------------------------------------------------------
.\" * Define some portability stuff
.\" -----------------------------------------------------------------
Expand Down Expand Up @@ -41,7 +41,7 @@ Most commands accept VCF, bgzipped VCF and BCF with filetype detected automatica
BCFtools is designed to work on a stream\&. It regards an input file "\-" as the standard input (stdin) and outputs to the standard output (stdout)\&. Several commands can thus be combined with Unix pipes\&.
.SS "VERSION"
.sp
This manual page was last updated \fB2017\-07\-03 15:52 BST\fR and refers to bcftools git version \fB1\&.5\-7\-gfd281df+\fR\&.
This manual page was last updated \fB2017\-07\-06 16:16 BST\fR and refers to bcftools git version \fB1\&.5\-8\-g0874974+\fR\&.
.SS "BCF1"
.sp
The BCF1 format output by versions of samtools <= 0\&.1\&.19 is \fBnot\fR compatible with this version of bcftools\&. To read BCF1 files one can use the view command from old versions of bcftools packaged with samtools versions <= 0\&.1\&.19 to convert to VCF, which can then be read by this version of bcftools\&.
Expand Down Expand Up @@ -318,6 +318,19 @@ For a full list of available commands, run \fBbcftools\fR without arguments\&. F
.IP \(bu 2.3
.\}

\fBsort\fR
\&.\&. sort VCF/BCF files
.RE
.sp
.RS 4
.ie n \{\
\h'-04'\(bu\h'+03'\c
.\}
.el \{\
.sp -1
.IP \(bu 2.3
.\}

\fBstats\fR
\&.\&. produce VCF/BCF stats (former vcfcheck)
.RE
Expand Down Expand Up @@ -3925,6 +3938,29 @@ estimate HMM parameters using Baum\-Welch algorithm, using the convergence thres
\fIFLOAT\fR, e\&.g\&. 1e\-10 (experimental)
.RE
.RE
.SS "bcftools sort [\fIOPTIONS\fR] file\&.bcf"
.PP
\fB\-m, \-\-max\-mem\fR \fIFLOAT\fR[\fIkMG\fR]
.RS 4
Maximum memory to use\&. Approximate, affects the number of temporary files written to the disk\&. Note that if the command fails at this step because of too many open files, your system limit on the number of open files ("ulimit") may need to be increased\&.
.RE
.PP
\fB\-o, \-\-output\fR \fIFILE\fR
.RS 4
see
\fBCommon Options\fR
.RE
.PP
\fB\-O, \-\-output\-type\fR \fIb\fR|\fIu\fR|\fIz\fR|\fIv\fR
.RS 4
see
\fBCommon Options\fR
.RE
.PP
\fB\-T, \-\-temp\-dir\fR \fIDIR\fR
.RS 4
Use this directory to store temporary files
.RE
.SS "bcftools stats [\fIOPTIONS\fR] \fIA\&.vcf\&.gz\fR [\fIB\&.vcf\&.gz\fR]"
.sp
Parses VCF or BCF and produces text file stats which is suitable for machine processing and can be plotted using \fBplot\-vcfstats\fR\&. When two files are given, the program generates separate stats for intersection and the complements\&. By default only sites are compared, \fB\-s\fR/\fB\-S\fR must be given to also include sample columns\&. When one VCF file is specified on the command line, then stats by non\-reference allele frequency, depth distribution, stats by quality and per\-sample counts, singleton stats, etc\&. are printed\&. When two VCF files are given, then stats such as concordance (Genotype concordance by non\-reference allele frequency, Genotype concordance by sample, Non\-Reference Discordance) and correlation are also printed\&. Per\-site discordance (PSD) is also printed in \fB\-\-verbose\fR mode\&.
Expand Down
26 changes: 23 additions & 3 deletions doc/bcftools.html
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp25035616"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipulating VCFs and BCFs.</p></div><div class="refsynopsisdiv" title="Synopsis"><a id="_synopsis"></a><h2>Synopsis</h2><p><span class="strong"><strong>bcftools</strong></span> [--version|--version-only] [--help] [<span class="emphasis"><em>COMMAND</em></span>] [<span class="emphasis"><em>OPTIONS</em></span>]</p></div><div class="refsect1" title="DESCRIPTION"><a id="_description"></a><h2>DESCRIPTION</h2><p>BCFtools is a set of utilities that manipulate variant calls in the Variant
<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /><title>bcftools</title><link rel="stylesheet" type="text/css" href="docbook-xsl.css" /><meta name="generator" content="DocBook XSL Stylesheets V1.76.1" /></head><body><div xml:lang="en" class="refentry" title="bcftools" lang="en"><a id="idp25198976"></a><div class="titlepage"></div><div class="refnamediv"><h2>Name</h2><p>bcftools — utilities for variant calling and manipulating VCFs and BCFs.</p></div><div class="refsynopsisdiv" title="Synopsis"><a id="_synopsis"></a><h2>Synopsis</h2><p><span class="strong"><strong>bcftools</strong></span> [--version|--version-only] [--help] [<span class="emphasis"><em>COMMAND</em></span>] [<span class="emphasis"><em>OPTIONS</em></span>]</p></div><div class="refsect1" title="DESCRIPTION"><a id="_description"></a><h2>DESCRIPTION</h2><p>BCFtools is a set of utilities that manipulate variant calls in the Variant
Call Format (VCF) and its binary counterpart BCF. All commands work
transparently with both VCFs and BCFs, both uncompressed and BGZF-compressed.</p><p>Most commands accept VCF, bgzipped VCF and BCF with filetype detected
automatically even when streaming from a pipe. Indexed VCF and BCF
will work in all situations. Un-indexed VCF and BCF and streams will
work in most, but not all situations. In general, whenever multiple VCFs are
read simultaneously, they must be indexed and therefore also compressed.</p><p>BCFtools is designed to work on a stream. It regards an input file "-" as the
standard input (stdin) and outputs to the standard output (stdout). Several
commands can thus be combined with Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2017-07-03 15:52 BST</strong></span> and refers to bcftools git version <span class="strong"><strong>1.5-7-gfd281df+</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools &lt;= 0.1.19 is <span class="strong"><strong>not</strong></span>
commands can thus be combined with Unix pipes.</p><div class="refsect2" title="VERSION"><a id="_version"></a><h3>VERSION</h3><p>This manual page was last updated <span class="strong"><strong>2017-07-06 16:16 BST</strong></span> and refers to bcftools git version <span class="strong"><strong>1.5-8-g0874974+</strong></span>.</p></div><div class="refsect2" title="BCF1"><a id="_bcf1"></a><h3>BCF1</h3><p>The BCF1 format output by versions of samtools &lt;= 0.1.19 is <span class="strong"><strong>not</strong></span>
compatible with this version of bcftools. To read BCF1 files one can use
the view command from old versions of bcftools packaged with samtools
versions &lt;= 0.1.19 to convert to VCF, which can then be read by
Expand Down Expand Up @@ -57,6 +57,8 @@
</li><li class="listitem">
<span class="strong"><strong><a class="link" href="#roh" title="bcftools roh [OPTIONS] file.vcf.gz">roh</a></strong></span> .. identify runs of homo/auto-zygosity
</li><li class="listitem">
<span class="strong"><strong><a class="link" href="#sort" title="bcftools sort [OPTIONS] file.bcf">sort</a></strong></span> .. sort VCF/BCF files
</li><li class="listitem">
<span class="strong"><strong><a class="link" href="#stats" title="bcftools stats [OPTIONS] A.vcf.gz [B.vcf.gz]">stats</a></strong></span> .. produce VCF/BCF stats (former vcfcheck)
</li><li class="listitem">
<span class="strong"><strong><a class="link" href="#view" title="bcftools view [OPTIONS] file.vcf.gz [REGION […]]">view</a></strong></span> .. subset, filter and convert VCF and BCF files
Expand Down Expand Up @@ -2344,7 +2346,25 @@
</span></dt><dd>
estimate HMM parameters using Baum-Welch algorithm, using the convergence threshold
<span class="emphasis"><em>FLOAT</em></span>, e.g. 1e-10 (experimental)
</dd></dl></div></div></div><div class="refsect2" title="bcftools stats [OPTIONS] A.vcf.gz [B.vcf.gz]"><a id="stats"></a><h3>bcftools stats [<span class="emphasis"><em>OPTIONS</em></span>] <span class="emphasis"><em>A.vcf.gz</em></span> [<span class="emphasis"><em>B.vcf.gz</em></span>]</h3><p>Parses VCF or BCF and produces text file stats which is suitable for machine
</dd></dl></div></div></div><div class="refsect2" title="bcftools sort [OPTIONS] file.bcf"><a id="sort"></a><h3>bcftools sort [<span class="emphasis"><em>OPTIONS</em></span>] file.bcf</h3><div class="variablelist"><dl><dt><span class="term">
<span class="strong"><strong>-m, --max-mem</strong></span> <span class="emphasis"><em>FLOAT</em></span>[<span class="emphasis"><em>kMG</em></span>]
</span></dt><dd>
Maximum memory to use. Approximate, affects the number of temporary files written
to the disk. Note that if the command fails at this step because of too many open files,
your system limit on the number of open files ("ulimit") may need to be increased.
</dd><dt><span class="term">
<span class="strong"><strong>-o, --output</strong></span> <span class="emphasis"><em>FILE</em></span>
</span></dt><dd>
see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
</dd><dt><span class="term">
<span class="strong"><strong>-O, --output-type</strong></span> <span class="emphasis"><em>b</em></span>|<span class="emphasis"><em>u</em></span>|<span class="emphasis"><em>z</em></span>|<span class="emphasis"><em>v</em></span>
</span></dt><dd>
see <span class="strong"><strong><a class="link" href="#common_options" title="Common Options">Common Options</a></strong></span>
</dd><dt><span class="term">
<span class="strong"><strong>-T, --temp-dir</strong></span> <span class="emphasis"><em>DIR</em></span>
</span></dt><dd>
Use this directory to store temporary files
</dd></dl></div></div><div class="refsect2" title="bcftools stats [OPTIONS] A.vcf.gz [B.vcf.gz]"><a id="stats"></a><h3>bcftools stats [<span class="emphasis"><em>OPTIONS</em></span>] <span class="emphasis"><em>A.vcf.gz</em></span> [<span class="emphasis"><em>B.vcf.gz</em></span>]</h3><p>Parses VCF or BCF and produces text file stats which is suitable for machine
processing and can be plotted using <span class="strong"><strong><a class="link" href="#plot-vcfstats" title="plot-vcfstats [OPTIONS] file.vchk […]">plot-vcfstats</a></strong></span>. When two files are given,
the program generates separate stats for intersection and the complements. By
default only sites are compared, <span class="strong"><strong>-s</strong></span>/<span class="strong"><strong>-S</strong></span> must be given to also include sample
Expand Down
19 changes: 19 additions & 0 deletions doc/bcftools.txt
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,7 @@ list of available options, run *bcftools* 'COMMAND' without arguments.
- *<<query,query>>* .. transform VCF/BCF into user-defined formats
- *<<reheader,reheader>>* .. modify VCF/BCF header, change sample names
- *<<roh,roh>>* .. identify runs of homo/auto-zygosity
- *<<sort,sort>>* .. sort VCF/BCF files
- *<<stats,stats>>* .. produce VCF/BCF stats (former vcfcheck)
- *<<view,view>>* .. subset, filter and convert VCF and BCF files

Expand Down Expand Up @@ -2363,6 +2364,24 @@ Transition probabilities:
'FLOAT', e.g. 1e-10 (experimental)


[[sort]]
=== bcftools sort ['OPTIONS'] file.bcf

*-m, --max-mem* 'FLOAT'['kMG']::
Maximum memory to use. Approximate, affects the number of temporary files written
to the disk. Note that if the command fails at this step because of too many open files,
your system limit on the number of open files ("ulimit") may need to be increased.

*-o, --output* 'FILE'::
see *<<common_options,Common Options>>*

*-O, --output-type* 'b'|'u'|'z'|'v'::
see *<<common_options,Common Options>>*

*-T, --temp-dir* 'DIR'::
Use this directory to store temporary files



[[stats]]
=== bcftools stats ['OPTIONS'] 'A.vcf.gz' ['B.vcf.gz']
Expand Down
5 changes: 4 additions & 1 deletion kheap.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@
// "data_t".
heap_t *heap = khp_init(mh);
// When inserting a new element, the heap stores a copy of the memory
// area pointed to by the third argument.
for (int i=0; i<3; i++)
khp_insert(mh, heap, &data[i]);
Expand Down Expand Up @@ -130,7 +132,8 @@
{ \
heap->mdat = heap->ndat; \
kroundup32(heap->mdat); \
heap->dat = (kheap_t*)realloc(heap->dat, heap->mdat*sizeof(kheap_t)); \
heap->dat = (kheap_t*)realloc(heap->dat, heap->mdat*sizeof(kheap_t)); \
memset(heap->dat + heap->ndat, 0, (heap->mdat - heap->ndat)*sizeof(kheap_t)); \
} \
int i = heap->ndat - 1; \
while ( i && __cmp(dat,&heap->dat[khp_parent(i)]) ) \
Expand Down
5 changes: 5 additions & 0 deletions main.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ int main_plugin(int argc, char *argv[]);
int main_consensus(int argc, char *argv[]);
int main_csq(int argc, char *argv[]);
int bam_mpileup(int argc, char *argv[]);
int main_sort(int argc, char *argv[]);

typedef struct
{
Expand Down Expand Up @@ -126,6 +127,10 @@ static cmd_t cmds[] =
.alias = "reheader",
.help = "modify VCF/BCF header, change sample names"
},
{ .func = main_sort,
.alias = "sort",
.help = "sort VCF/BCF file"
},
{ .func = main_vcfview,
.alias = "view",
.help = "VCF/BCF conversion, view, subset and filter VCF/BCF files"
Expand Down
15 changes: 15 additions & 0 deletions test/sort.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
1 101
1 102
1 103
1 104
1 105
2 101
2 102
2 103
2 104
2 105
3 101
3 102
3 103
3 104
3 105
21 changes: 21 additions & 0 deletions test/sort.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
##fileformat=VCFv4.2
##reference=file:///ref.fa
##contig=<ID=1,length=2147483647>
##contig=<ID=2,length=2147483647>
##contig=<ID=3,length=2147483647>
#CHROM POS ID REF ALT QUAL FILTER INFO
3 105 . T C . . .
3 104 . T C . . .
3 103 . T C . . .
3 102 . T C . . .
3 101 . T C . . .
2 105 . T C . . .
2 104 . T C . . .
2 103 . T C . . .
2 102 . T C . . .
2 101 . T C . . .
1 105 . T C . . .
1 104 . T C . . .
1 103 . T C . . .
1 102 . T C . . .
1 101 . T C . . .
10 changes: 9 additions & 1 deletion test/test.pl
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,8 @@
test_vcf_filter($opts,in=>'filter.2',out=>'filter.17.out',args=>q[-i'GT="HOM"'],fmt=>'%POS[\\t%GT]\\n');
test_vcf_filter($opts,in=>'filter.2',out=>'filter.18.out',args=>q[-i'GT="HET"'],fmt=>'%POS[\\t%GT]\\n');
test_vcf_filter($opts,in=>'filter.2',out=>'filter.19.out',args=>q[-i'GT="HAP"'],fmt=>'%POS[\\t%GT]\\n');
test_vcf_sort($opts,in=>'sort',out=>'sort.out',args=>q[-m 0],fmt=>'%CHROM\\t%POS\\n');
test_vcf_sort($opts,in=>'sort',out=>'sort.out',args=>q[-m 1000],fmt=>'%CHROM\\t%POS\\n');
test_vcf_regions($opts,in=>'regions');
test_vcf_annotate($opts,in=>'annotate',tab=>'annotate',out=>'annotate.out',args=>'-c CHROM,POS,REF,ALT,ID,QUAL,INFO/T_INT,INFO/T_FLOAT,INDEL');
test_vcf_annotate($opts,in=>'annotate',tab=>'annotate2',out=>'annotate2.out',args=>'-c CHROM,FROM,TO,T_STR');
Expand Down Expand Up @@ -713,6 +715,13 @@ sub test_vcf_filter
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools filter $args{args} $$opts{path}/$args{in}.vcf | $pipe");
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools filter -Ob $args{args} $$opts{path}/$args{in}.vcf | $$opts{bin}/bcftools view | $pipe");
}
# Build two `bcftools sort` pipelines for the given input and hand each one
# to test_cmd together with the caller's named arguments.
#
# Named arguments read here (all of %args is also forwarded to test_cmd):
#   in   .. basename of the input file under $$opts{path}; ".vcf" is appended
#   args .. extra command-line options placed after `bcftools sort`
#   fmt  .. format string passed to `bcftools query -f`
#
# The first pipeline runs sort without -O; the second runs it with -Ob and
# pipes the result through `bcftools view`. Both end in the same query step.
sub test_vcf_sort
{
    my ($opts,%args) = @_;
    my $bcftools = "$$opts{bin}/bcftools";
    my $input    = "$$opts{path}/$args{in}.vcf";
    my $query    = "$bcftools query -f '$args{fmt}'";
    my @sorters  = (
        "$bcftools sort $args{args} $input",
        "$bcftools sort -Ob $args{args} $input | $bcftools view",
    );
    for my $sorter (@sorters)
    {
        test_cmd($opts,%args,cmd=>"$sorter | $query");
    }
}
sub test_vcf_regions
{
my ($opts,%args) = @_;
Expand Down Expand Up @@ -1098,4 +1107,3 @@ sub test_csq_real
}
closedir($dh);
}
13 changes: 6 additions & 7 deletions vcfisec.c
Original file line number Diff line number Diff line change
Expand Up @@ -82,13 +82,12 @@ void mkdir_p(const char *fmt, ...)
while (*p)
{
while (*p && *p!='/') p++;
if ( *p )
{
*p = 0;
mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
*p = '/';
p++;
}
char ctmp = *p;
*p = 0;
int ret = mkdir(tmp,S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
if ( ret!=0 && errno!=EEXIST ) error("Error creating directory %s: %s\n", path,strerror(errno));
*p = ctmp;
p++;
}
free(tmp);
free(path);
Expand Down

0 comments on commit 8fe6acd

Please sign in to comment.