-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
426 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
from bloom.filter_seqs_from_biom import remove_seqs, trim_seqs | ||
|
||
|
||
__all__ = ['remove_seqs', 'trim_seqs'] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
import biom | ||
import argparse | ||
import sys | ||
|
||
|
||
def trim_seqs(seqs, length = 100): | ||
""" | ||
Trims the sequences to a given length | ||
Parameters | ||
---------- | ||
seqs: generator of skbio.Sequence objects | ||
Returns | ||
------- | ||
generator of skbio.Sequence objects | ||
trimmed sequences | ||
""" | ||
for seq in seqs: | ||
yield seq[:length] | ||
|
||
def remove_seqs(table, seqs): | ||
""" | ||
Parameters | ||
---------- | ||
table : biom.Table | ||
Input biom table | ||
seqs : generator, skbio.Sequence | ||
Iterator of sequence objects to be removed from the biom table. | ||
Return | ||
------ | ||
biom.Table | ||
""" | ||
filter_seqs = set([str(s) for s in seqs]) | ||
bloom_filter = lambda v, i, m: i in filter_seqs | ||
table.filter(bloom_filter, axis='observation') | ||
return table | ||
|
||
|
||
def main(argv): | ||
parser=argparse.ArgumentParser( | ||
description='Filter sequences from biom table using a fasta file. Version ' + \ | ||
__version__) | ||
parser.add_argument('-i','--inputtable', | ||
help='input biom table file name') | ||
parser.add_argument('-o','--output', | ||
help='output biom file name') | ||
parser.add_argument('-f','--fasta', | ||
help='fitering fasta file name') | ||
parser.add_argument('--ignore-table-seq-length', | ||
help="don't trim fasta file sequences to table read length", | ||
action='store_true') | ||
|
||
args=parser.parse_args(argv) | ||
|
||
seqs = skbio.read(args.fasta, format='fasta') | ||
table = load_table(args.inputtable) | ||
length = min(map(len, table.ids(axis='observation'))) | ||
if not parse.ignore_table_seq_length: | ||
seqs = trim_seqs(seqs, length=length) | ||
|
||
outtable = remove(table, seqs) | ||
|
||
with biom.util.biom_open(args.output, 'w') as f: | ||
outtable.to_hdf5(f, "filterbiomseqs") | ||
|
||
|
||
if __name__ == "__main__": | ||
main(sys.argv[1:]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
from biom import Table | ||
from bloom.filter_seqs_from_biom import remove_seqs, trim_seqs | ||
import skbio | ||
import unittest | ||
import numpy as np | ||
import numpy.testing as npt | ||
|
||
class TestFilter(unittest.TestCase): | ||
def setUp(self): | ||
self.seqs = (skbio.Sequence('AACCGGTT'), | ||
skbio.Sequence('AACCGAGG'), | ||
skbio.Sequence('AACCTTTT'), | ||
skbio.Sequence('AACCGCTC')) | ||
self.table = Table( | ||
np.array([ | ||
[0, 1, 1], | ||
[0, 2, 1], | ||
[1, 0, 1], | ||
[0, 0, 1], | ||
[9, 1, 1]]), | ||
['AACCGG', | ||
'AACCGA', | ||
'AACCTT', | ||
'AACCGC', | ||
'AAAAAA'], | ||
['s1', 's2', 's3']) | ||
|
||
def test_trim_seqs(self): | ||
seqs = trim_seqs(self.seqs, length=6) | ||
exp = [skbio.Sequence('AACCGG'), | ||
skbio.Sequence('AACCGA'), | ||
skbio.Sequence('AACCTT'), | ||
skbio.Sequence('AACCGC')] | ||
self.assertEqual(list(seqs), exp) | ||
|
||
def test_remove_seqs(self): | ||
seqs = trim_seqs(self.seqs, length=6) | ||
res = remove(self.table, seqs) | ||
exp = Table( | ||
np.array([ | ||
[0, 1, 1], | ||
[0, 2, 1], | ||
[1, 0, 1], | ||
[0, 0, 1]]), | ||
['AACCGG', 'AACCGA', | ||
'AACCTT', 'AACCGC'], | ||
['s1', 's2', 's3']) | ||
npt.assert_array_equal(np.array(exp.matrix_data.todense()), | ||
np.array(res.matrix_data.todense())) | ||
|
||
|
||
if __name__=='__main__': | ||
unittest.main() |
Binary file not shown.
Binary file not shown.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"Here we will " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"import os\n", | ||
"import skbio\n", | ||
"from biom import Table, load_table\n", | ||
"from biom.util import biom_open\n", | ||
"from bloom import remove_seqs, trim_seqs" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"data_dir = '../data/'\n", | ||
"results_dir = '../results'\n", | ||
"zip_file = '30_seqs.fna.gz'\n", | ||
"seqs_file = '30_seqs.fna'\n", | ||
"biom_file = 'ercolini.feces.clean.withtax.biom'\n", | ||
"filtered_file = 'filtered.fna'\n", | ||
"\n", | ||
"zip_file = os.path.join(data_dir, zip_file)\n", | ||
"seqs_file = os.path.join(data_dir, seqs_file)\n", | ||
"biom_file = os.path.join(data_dir, biom_file)\n", | ||
"filtered_file = os.path.join(results_dir, filtered_file)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"gunzip: can't stat: ../data/30_seqs.fna.gz (../data/30_seqs.fna.gz.gz): No such file or directory\r\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"!gunzip ../data/30_seqs.fna.gz" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"seqs = skbio.read(seqs_file, format='fasta')\n", | ||
"table = load_table(biom_file)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"length = min(map(len, table.ids(axis='observation')))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"seqs = trim_seqs(seqs, length=length)\n", | ||
"outtable = remove_seqs(table, seqs)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"with biom_open(filtered_file, 'w') as f:\n", | ||
" outtable.to_hdf5(f, \"filterbiomseqs\")\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"collapsed": true | ||
}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.5.2" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 0 | ||
} |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
#!/usr/bin/env python | ||
|
||
# ---------------------------------------------------------------------------- | ||
# Copyright (c) 2013--, scikit-bio development team. | ||
# | ||
# Distributed under the terms of the Modified BSD License. | ||
# | ||
# The full license is in the file COPYING.txt, distributed with this software. | ||
# ---------------------------------------------------------------------------- | ||
|
||
import os | ||
import platform | ||
import re | ||
import ast | ||
import sys | ||
|
||
from setuptools import find_packages, setup | ||
from setuptools.extension import Extension | ||
|
||
import numpy as np | ||
|
||
classes = """ | ||
Development Status :: 1 - Pre-Alpha | ||
License :: OSI Approved :: BSD License | ||
Topic :: Software Development :: Libraries | ||
Topic :: Scientific/Engineering | ||
Topic :: Scientific/Engineering :: Bio-Informatics | ||
Programming Language :: Python :: 3 | ||
Programming Language :: Python :: 3 :: Only | ||
Programming Language :: Python :: 3.4 | ||
Programming Language :: Python :: 3.5 | ||
Operating System :: Unix | ||
Operating System :: POSIX | ||
Operating System :: MacOS :: MacOS X | ||
""" | ||
classifiers = [s.strip() for s in classes.split('\n') if s] | ||
|
||
setup(name='bloom_analyses', | ||
version='1.0', | ||
license='BSD-3-Clause', | ||
description='The Bloom Filter', | ||
long_description='The Bloom Filter', | ||
author="bloom development team", | ||
author_email="[email protected]", | ||
maintainer="bloom development team", | ||
maintainer_email="[email protected]", | ||
packages=find_packages(), | ||
include_dirs=[np.get_include()], | ||
install_requires=[ | ||
'scikit-bio', 'numpy', 'biom-format' | ||
], | ||
classifiers=classifiers, | ||
package_data={ | ||
} | ||
) |