Skip to content

Commit

Permalink
Added explicit check that data is strict hierarchy
Browse files Browse the repository at this point in the history
  • Loading branch information
donovan-h-parks committed Jul 31, 2014
1 parent f03afbb commit 17158af
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 4 deletions.
45 changes: 43 additions & 2 deletions stamp/metagenomics/fileIO/StampIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#=======================================================================

import string
from collections import defaultdict

from stamp.metagenomics.ProfileTree import ProfileTree, Node
from stamp.metagenomics.StringHelper import isNumber
Expand All @@ -36,7 +37,7 @@ def read(self, filename):
fin = open(filename, 'U')
data = map(string.strip, fin.readlines())
fin.close()

profileTree = ProfileTree()

# determine number of hierarchical levels and samples
Expand All @@ -50,6 +51,11 @@ def read(self, filename):
errMsg = 'Profile file must contain a column indicating feature names.'
return None, errMsg

# verify data forms a strict hierarchy
errMsg = self.checkHierarchy(data, profileTree.numHierarchicalLevels())
if errMsg != None:
return None, errMsg

# construct profile tree
try:
profileTree.numSeqInSample = [0] * profileTree.numSamples()
Expand All @@ -67,7 +73,7 @@ def read(self, filename):
# check for unclassified categories
taxa = ''
for j in xrange(0, len(categories)):
if categories[j].lower() == 'unclassified':
if self.isUnclassified(categories[j]):
categories[j] = 'Unclassified ' + taxa
categories[j] = categories[j].rstrip()
else:
Expand Down Expand Up @@ -95,6 +101,14 @@ def read(self, filename):
errMsg = 'Failed to correctly parse line: ' + str(i+1)

return profileTree, errMsg

def isUnclassified(self, value):
    """Report whether a category name denotes an unclassified entry.

    A value (e.g., a taxon or metabolic pathway name) counts as
    unclassified when, ignoring case, it is exactly 'unclassified' or it
    is a single leading character followed by '__unclassified' (the
    '*__unclassified' form used by GreenGenes).

    value: category name to test (string).

    Returns True if the value is unclassified, False otherwise.
    """
    lowered = value.lower()

    # plain form: the whole field is the word 'unclassified'
    if lowered == 'unclassified':
        return True

    # prefixed form: drop the first character and compare the remainder
    return lowered[1:] == '__unclassified'

def determineColumns(self, data, profileTree):
firstDataRow = data[1].split('\t')
Expand All @@ -111,3 +125,30 @@ def determineColumns(self, data, profileTree):
headings = map(string.strip, headings)
profileTree.hierarchyHeadings = headings[0:firstSampleIndex]
profileTree.sampleNames = headings[firstSampleIndex:]

def checkHierarchy(self, data, numHierarchicalLevels):
    """Verify that data forms a strict hierarchy.

    A strict hierarchy requires that, at every level below the top, a
    given category name always appears under the same parent category.
    Categories reported as unclassified by self.isUnclassified() are
    exempt from this check.

    data: list of tab-separated lines; the first numHierarchicalLevels
        fields of each line are treated as hierarchy categories.
    numHierarchicalLevels: number of leading columns holding categories.

    Returns None if no conflict is found; otherwise returns an error
    message naming the first child seen with two different parents.
    """
    # NOTE(review): every line in data is scanned, which presumably
    # includes the header row -- confirm this is intended.

    # parentOf[level][child] -> parent recorded the first time child was seen
    parentOf = {}
    for line in data:
        # str.strip in a comprehension replaces the deprecated
        # string.strip function, which is unavailable in Python 3
        fields = [field.strip() for field in line.split('\t')]
        categories = fields[0:numHierarchicalLevels]

        # start at level 1: the top of the hierarchy has no parent
        for level in range(1, len(categories)):
            child = categories[level]
            if self.isUnclassified(child):
                continue  # ignore unclassified sequences

            parentName = categories[level - 1]

            # record the parent on first sight; otherwise fetch the known one
            knownParent = parentOf.setdefault(level, {}).setdefault(child, parentName)
            if knownParent != parentName:
                # data is not a strict hierarchy
                return "Data does not form a strict hierarchy. Child %s has multiple parents (e.g., %s, %s)." % (child, knownParent, parentName)

    return None

4 changes: 2 additions & 2 deletions windows/STAMP_Setup.iss
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
; (To generate a new GUID, click Tools | Generate GUID inside the IDE.)
AppId={{058AA4B2-FB5A-4D89-9134-D6E06AB9E894}
AppName=STAMP
AppVerName=STAMP v2.0.6
AppVerName=STAMP v2.0.7
AppPublisher=Donovan Parks and Robert Beiko
AppPublisherURL=http://kiwi.cs.dal.ca/Software/STAMP
AppSupportURL=http://kiwi.cs.dal.ca/Software/STAMP
Expand All @@ -18,7 +18,7 @@ AllowNoIcons=yes
LicenseFile=..\dist\LICENSE.txt
InfoBeforeFile=..\dist\readme.txt
OutputDir=.\install
OutputBaseFilename=STAMP_2_0_6
OutputBaseFilename=STAMP_2_0_7
SetupIconFile=..\dist\icons\stamp.ico
Compression=lzma
SolidCompression=yes
Expand Down

0 comments on commit 17158af

Please sign in to comment.