Skip to content

Commit

Permalink
Adding initial analyses
Browse files Browse the repository at this point in the history
  • Loading branch information
mortonjt committed Oct 25, 2016
1 parent 9cf0153 commit e44ffd0
Show file tree
Hide file tree
Showing 9 changed files with 426 additions and 0 deletions.
4 changes: 4 additions & 0 deletions bloom/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from bloom.filter_seqs_from_biom import remove_seqs, trim_seqs


__all__ = ['remove_seqs', 'trim_seqs']
70 changes: 70 additions & 0 deletions bloom/filter_seqs_from_biom.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import biom
import argparse
import sys


def trim_seqs(seqs, length = 100):
"""
Trims the sequences to a given length
Parameters
----------
seqs: generator of skbio.Sequence objects
Returns
-------
generator of skbio.Sequence objects
trimmed sequences
"""
for seq in seqs:
yield seq[:length]

def remove_seqs(table, seqs):
"""
Parameters
----------
table : biom.Table
Input biom table
seqs : generator, skbio.Sequence
Iterator of sequence objects to be removed from the biom table.
Return
------
biom.Table
"""
filter_seqs = set([str(s) for s in seqs])
bloom_filter = lambda v, i, m: i in filter_seqs
table.filter(bloom_filter, axis='observation')
return table


def main(argv):
parser=argparse.ArgumentParser(
description='Filter sequences from biom table using a fasta file. Version ' + \
__version__)
parser.add_argument('-i','--inputtable',
help='input biom table file name')
parser.add_argument('-o','--output',
help='output biom file name')
parser.add_argument('-f','--fasta',
help='fitering fasta file name')
parser.add_argument('--ignore-table-seq-length',
help="don't trim fasta file sequences to table read length",
action='store_true')

args=parser.parse_args(argv)

seqs = skbio.read(args.fasta, format='fasta')
table = load_table(args.inputtable)
length = min(map(len, table.ids(axis='observation')))
if not parse.ignore_table_seq_length:
seqs = trim_seqs(seqs, length=length)

outtable = remove(table, seqs)

with biom.util.biom_open(args.output, 'w') as f:
outtable.to_hdf5(f, "filterbiomseqs")


if __name__ == "__main__":
main(sys.argv[1:])
53 changes: 53 additions & 0 deletions bloom/test_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from biom import Table
from bloom.filter_seqs_from_biom import remove_seqs, trim_seqs
import skbio
import unittest
import numpy as np
import numpy.testing as npt

class TestFilter(unittest.TestCase):
def setUp(self):
self.seqs = (skbio.Sequence('AACCGGTT'),
skbio.Sequence('AACCGAGG'),
skbio.Sequence('AACCTTTT'),
skbio.Sequence('AACCGCTC'))
self.table = Table(
np.array([
[0, 1, 1],
[0, 2, 1],
[1, 0, 1],
[0, 0, 1],
[9, 1, 1]]),
['AACCGG',
'AACCGA',
'AACCTT',
'AACCGC',
'AAAAAA'],
['s1', 's2', 's3'])

def test_trim_seqs(self):
seqs = trim_seqs(self.seqs, length=6)
exp = [skbio.Sequence('AACCGG'),
skbio.Sequence('AACCGA'),
skbio.Sequence('AACCTT'),
skbio.Sequence('AACCGC')]
self.assertEqual(list(seqs), exp)

def test_remove_seqs(self):
seqs = trim_seqs(self.seqs, length=6)
res = remove(self.table, seqs)
exp = Table(
np.array([
[0, 1, 1],
[0, 2, 1],
[1, 0, 1],
[0, 0, 1]]),
['AACCGG', 'AACCGA',
'AACCTT', 'AACCGC'],
['s1', 's2', 's3'])
npt.assert_array_equal(np.array(exp.matrix_data.todense()),
np.array(res.matrix_data.todense()))


if __name__=='__main__':
unittest.main()
Binary file added data/30_seqs.fna.gz
Binary file not shown.
Binary file added data/ercolini.feces.clean.withtax.biom
Binary file not shown.
97 changes: 97 additions & 0 deletions data/map.ercolini.txt

Large diffs are not rendered by default.

147 changes: 147 additions & 0 deletions ipynb/bloom_example.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"Here we will "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import os\n",
"import skbio\n",
"from biom import Table, load_table\n",
"from biom.util import biom_open\n",
"from bloom import remove_seqs, trim_seqs"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data_dir = '../data/'\n",
"results_dir = '../results'\n",
"zip_file = '30_seqs.fna.gz'\n",
"seqs_file = '30_seqs.fna'\n",
"biom_file = 'ercolini.feces.clean.withtax.biom'\n",
"filtered_file = 'filtered.fna'\n",
"\n",
"zip_file = os.path.join(data_dir, zip_file)\n",
"seqs_file = os.path.join(data_dir, seqs_file)\n",
"biom_file = os.path.join(data_dir, biom_file)\n",
"filtered_file = os.path.join(results_dir, filtered_file)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"gunzip: can't stat: ../data/30_seqs.fna.gz (../data/30_seqs.fna.gz.gz): No such file or directory\r\n"
]
}
],
"source": [
"!gunzip ../data/30_seqs.fna.gz"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"seqs = skbio.read(seqs_file, format='fasta')\n",
"table = load_table(biom_file)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"length = min(map(len, table.ids(axis='observation')))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"seqs = trim_seqs(seqs, length=length)\n",
"outtable = remove_seqs(table, seqs)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"with biom_open(filtered_file, 'w') as f:\n",
" outtable.to_hdf5(f, \"filterbiomseqs\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Empty file added results/__init__.py
Empty file.
55 changes: 55 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#!/usr/bin/env python

# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

import os
import platform
import re
import ast
import sys

from setuptools import find_packages, setup
from setuptools.extension import Extension

import numpy as np

classes = """
Development Status :: 1 - Pre-Alpha
License :: OSI Approved :: BSD License
Topic :: Software Development :: Libraries
Topic :: Scientific/Engineering
Topic :: Scientific/Engineering :: Bio-Informatics
Programming Language :: Python :: 3
Programming Language :: Python :: 3 :: Only
Programming Language :: Python :: 3.4
Programming Language :: Python :: 3.5
Operating System :: Unix
Operating System :: POSIX
Operating System :: MacOS :: MacOS X
"""
classifiers = [s.strip() for s in classes.split('\n') if s]

setup(name='bloom_analyses',
version='1.0',
license='BSD-3-Clause',
description='The Bloom Filter',
long_description='The Bloom Filter',
author="bloom development team",
author_email="[email protected]",
maintainer="bloom development team",
maintainer_email="[email protected]",
packages=find_packages(),
include_dirs=[np.get_include()],
install_requires=[
'scikit-bio', 'numpy', 'biom-format'
],
classifiers=classifiers,
package_data={
}
)

0 comments on commit e44ffd0

Please sign in to comment.