Adding initial analyses

knightlab-analyses · Oct 25, 2016 · e44ffd0 · e44ffd0
1 parent 9cf0153
commit e44ffd0
Show file tree

Hide file tree

Showing 9 changed files with 426 additions and 0 deletions.
diff --git a/bloom/__init__.py b/bloom/__init__.py
@@ -0,0 +1,4 @@
+from bloom.filter_seqs_from_biom import remove_seqs, trim_seqs
+
+
+__all__ = ['remove_seqs', 'trim_seqs']
diff --git a/bloom/filter_seqs_from_biom.py b/bloom/filter_seqs_from_biom.py
@@ -0,0 +1,70 @@
+import biom
+import argparse
+import sys
+
+
+def trim_seqs(seqs, length = 100):
+    """
+    Trims the sequences to a given length
+
+    Parameters
+    ----------
+    seqs: generator of skbio.Sequence objects
+
+    Returns
+    -------
+    generator of skbio.Sequence objects
+        trimmed sequences
+    """
+    for seq in seqs:
+        yield seq[:length]
+
+def remove_seqs(table, seqs):
+    """
+    Parameters
+    ----------
+    table : biom.Table
+       Input biom table
+    seqs : generator, skbio.Sequence
+       Iterator of sequence objects to be removed from the biom table.
+
+    Return
+    ------
+    biom.Table
+    """
+    filter_seqs = set([str(s) for s in seqs])
+    bloom_filter = lambda v, i, m: i in filter_seqs
+    table.filter(bloom_filter, axis='observation')
+    return table
+
+
+def main(argv):
+        parser=argparse.ArgumentParser(
+            description='Filter sequences from biom table using a fasta file. Version ' + \
+            __version__)
+        parser.add_argument('-i','--inputtable',
+                            help='input biom table file name')
+        parser.add_argument('-o','--output',
+                            help='output biom file name')
+        parser.add_argument('-f','--fasta',
+                            help='fitering fasta file name')
+        parser.add_argument('--ignore-table-seq-length',
+                            help="don't trim fasta file sequences to table read length",
+                            action='store_true')
+
+        args=parser.parse_args(argv)
+
+        seqs = skbio.read(args.fasta, format='fasta')
+        table = load_table(args.inputtable)
+        length = min(map(len, table.ids(axis='observation')))
+        if not parse.ignore_table_seq_length:
+            seqs = trim_seqs(seqs, length=length)
+
+        outtable = remove(table, seqs)
+
+        with biom.util.biom_open(args.output, 'w') as f:
+                outtable.to_hdf5(f, "filterbiomseqs")
+
+
+if __name__ == "__main__":
+        main(sys.argv[1:])
diff --git a/bloom/test_filter.py b/bloom/test_filter.py
@@ -0,0 +1,53 @@
+from biom import Table
+from bloom.filter_seqs_from_biom import remove_seqs, trim_seqs
+import skbio
+import unittest
+import numpy as np
+import numpy.testing as npt
+
+class TestFilter(unittest.TestCase):
+    def setUp(self):
+        self.seqs = (skbio.Sequence('AACCGGTT'),
+                     skbio.Sequence('AACCGAGG'),
+                     skbio.Sequence('AACCTTTT'),
+                     skbio.Sequence('AACCGCTC'))
+        self.table = Table(
+            np.array([
+                [0, 1, 1],
+                [0, 2, 1],
+                [1, 0, 1],
+                [0, 0, 1],
+                [9, 1, 1]]),
+            ['AACCGG',
+             'AACCGA',
+             'AACCTT',
+             'AACCGC',
+             'AAAAAA'],
+            ['s1', 's2', 's3'])
+
+    def test_trim_seqs(self):
+        seqs = trim_seqs(self.seqs, length=6)
+        exp = [skbio.Sequence('AACCGG'),
+               skbio.Sequence('AACCGA'),
+               skbio.Sequence('AACCTT'),
+               skbio.Sequence('AACCGC')]
+        self.assertEqual(list(seqs), exp)
+
+    def test_remove_seqs(self):
+        seqs = trim_seqs(self.seqs, length=6)
+        res = remove(self.table, seqs)
+        exp = Table(
+            np.array([
+                [0, 1, 1],
+                [0, 2, 1],
+                [1, 0, 1],
+                [0, 0, 1]]),
+            ['AACCGG', 'AACCGA',
+             'AACCTT', 'AACCGC'],
+            ['s1', 's2', 's3'])
+        npt.assert_array_equal(np.array(exp.matrix_data.todense()),
+                               np.array(res.matrix_data.todense()))
+
+
+if __name__=='__main__':
+    unittest.main()
diff --git a/data/30_seqs.fna.gz b/data/30_seqs.fna.gz
diff --git a/data/ercolini.feces.clean.withtax.biom b/data/ercolini.feces.clean.withtax.biom
diff --git a/data/map.ercolini.txt b/data/map.ercolini.txt
diff --git a/ipynb/bloom_example.ipynb b/ipynb/bloom_example.ipynb
@@ -0,0 +1,147 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "Here we will "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import skbio\n",
+    "from biom import Table, load_table\n",
+    "from biom.util import biom_open\n",
+    "from bloom import remove_seqs, trim_seqs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "data_dir = '../data/'\n",
+    "results_dir = '../results'\n",
+    "zip_file = '30_seqs.fna.gz'\n",
+    "seqs_file = '30_seqs.fna'\n",
+    "biom_file = 'ercolini.feces.clean.withtax.biom'\n",
+    "filtered_file = 'filtered.fna'\n",
+    "\n",
+    "zip_file = os.path.join(data_dir, zip_file)\n",
+    "seqs_file = os.path.join(data_dir, seqs_file)\n",
+    "biom_file = os.path.join(data_dir, biom_file)\n",
+    "filtered_file = os.path.join(results_dir, filtered_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "gunzip: can't stat: ../data/30_seqs.fna.gz (../data/30_seqs.fna.gz.gz): No such file or directory\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "!gunzip ../data/30_seqs.fna.gz"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "seqs = skbio.read(seqs_file, format='fasta')\n",
+    "table = load_table(biom_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "length = min(map(len, table.ids(axis='observation')))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "seqs = trim_seqs(seqs, length=length)\n",
+    "outtable = remove_seqs(table, seqs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "with biom_open(filtered_file, 'w') as f:\n",
+    "        outtable.to_hdf5(f, \"filterbiomseqs\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/results/__init__.py b/results/__init__.py
diff --git a/setup.py b/setup.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+
+# ----------------------------------------------------------------------------
+# Copyright (c) 2013--, scikit-bio development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file COPYING.txt, distributed with this software.
+# ----------------------------------------------------------------------------
+
+import os
+import platform
+import re
+import ast
+import sys
+
+from setuptools import find_packages, setup
+from setuptools.extension import Extension
+
+import numpy as np
+
+classes = """
+    Development Status :: 1 - Pre-Alpha
+    License :: OSI Approved :: BSD License
+    Topic :: Software Development :: Libraries
+    Topic :: Scientific/Engineering
+    Topic :: Scientific/Engineering :: Bio-Informatics
+    Programming Language :: Python :: 3
+    Programming Language :: Python :: 3 :: Only
+    Programming Language :: Python :: 3.4
+    Programming Language :: Python :: 3.5
+    Operating System :: Unix
+    Operating System :: POSIX
+    Operating System :: MacOS :: MacOS X
+"""
+classifiers = [s.strip() for s in classes.split('\n') if s]
+
+setup(name='bloom_analyses',
+      version='1.0',
+      license='BSD-3-Clause',
+      description='The Bloom Filter',
+      long_description='The Bloom Filter',
+      author="bloom development team",
+      author_email="[email protected]",
+      maintainer="bloom development team",
+      maintainer_email="[email protected]",
+      packages=find_packages(),
+      include_dirs=[np.get_include()],
+      install_requires=[
+          'scikit-bio', 'numpy', 'biom-format'
+      ],
+      classifiers=classifiers,
+      package_data={
+          }
+      )
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,4 @@
		from bloom.filter_seqs_from_biom import remove_seqs, trim_seqs


		__all__ = ['remove_seqs', 'trim_seqs']