From 17158af7c9a75db303c8010c233356c2316bdd2f Mon Sep 17 00:00:00 2001 From: Donovan Parks Date: Thu, 31 Jul 2014 11:03:57 +1000 Subject: [PATCH] Added explicit check that data is strict hierarchy --- stamp/metagenomics/fileIO/StampIO.py | 45 ++++++++++++++++++++++++++-- windows/STAMP_Setup.iss | 4 +-- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/stamp/metagenomics/fileIO/StampIO.py b/stamp/metagenomics/fileIO/StampIO.py index 318eae5..09d9584 100644 --- a/stamp/metagenomics/fileIO/StampIO.py +++ b/stamp/metagenomics/fileIO/StampIO.py @@ -22,6 +22,7 @@ #======================================================================= import string +from collections import defaultdict from stamp.metagenomics.ProfileTree import ProfileTree, Node from stamp.metagenomics.StringHelper import isNumber @@ -36,7 +37,7 @@ def read(self, filename): fin = open(filename, 'U') data = map(string.strip, fin.readlines()) fin.close() - + profileTree = ProfileTree() # determine number of hierarchical levels and samples @@ -50,6 +51,11 @@ def read(self, filename): errMsg = 'Profile file must contain a column indicating feature names.' return None, errMsg + # verify data forms a strict hierarchy + errMsg = self.checkHierarchy(data, profileTree.numHierarchicalLevels()) + if errMsg != None: + return None, errMsg + # construct profile tree try: profileTree.numSeqInSample = [0] * profileTree.numSamples() @@ -67,7 +73,7 @@ def read(self, filename): # check for unclassified categories taxa = '' for j in xrange(0, len(categories)): - if categories[j].lower() == 'unclassified': + if self.isUnclassified(categories[j]): categories[j] = 'Unclassified ' + taxa categories[j] = categories[j].rstrip() else: @@ -95,6 +101,14 @@ def read(self, filename): errMsg = 'Failed to correctly parse line: ' + str(i+1) return profileTree, errMsg + + def isUnclassified(self, value): + """Check if value (taxon, metabolic pathway) is unclassified.""" + + # currently unclassified sequences need to be explicitly stated as + # 'unclassified' (case insensitive) or '*__unclassified' which is + # the format used by GreenGenes + return value.lower() == 'unclassified' or value.lower()[1:] == '__unclassified' def determineColumns(self, data, profileTree): firstDataRow = data[1].split('\t') @@ -111,3 +125,30 @@ def determineColumns(self, data, profileTree): headings = map(string.strip, headings) profileTree.hierarchyHeadings = headings[0:firstSampleIndex] profileTree.sampleNames = headings[firstSampleIndex:] + + def checkHierarchy(self, data, numHierarchicalLevels): + """Verify that data forms a strict hierarchy.""" + parent = defaultdict(dict) + for line in data: + lineSplit = line.split('\t') + lineSplit = map(string.strip, lineSplit) + + categories = lineSplit[0:numHierarchicalLevels] + for r, value in enumerate(categories): + if r == 0: + continue # top of hierarchy has no parent + + if self.isUnclassified(value): + continue # ignore unclassified sequences + + if r not in parent: + parent[r] = {} + + if value not in parent[r]: + parent[r][value] = categories[r-1] + else: + if parent[r][value] != categories[r-1]: + # data is not a strict hierarchy + return "Data does not form a strict hierarchy. Child %s has multiple parents (e.g., %s, %s)." % (value, parent[r][value], categories[r-1]) + return None + diff --git a/windows/STAMP_Setup.iss b/windows/STAMP_Setup.iss index 19a2397..e375d25 100644 --- a/windows/STAMP_Setup.iss +++ b/windows/STAMP_Setup.iss @@ -7,7 +7,7 @@ ; (To generate a new GUID, click Tools | Generate GUID inside the IDE.) AppId={{058AA4B2-FB5A-4D89-9134-D6E06AB9E894} AppName=STAMP -AppVerName=STAMP v2.0.6 +AppVerName=STAMP v2.0.7 AppPublisher=Donovan Parks and Robert Beiko AppPublisherURL=http://kiwi.cs.dal.ca/Software/STAMP AppSupportURL=http://kiwi.cs.dal.ca/Software/STAMP @@ -18,7 +18,7 @@ AllowNoIcons=yes LicenseFile=..\dist\LICENSE.txt InfoBeforeFile=..\dist\readme.txt OutputDir=.\install -OutputBaseFilename=STAMP_2_0_6 +OutputBaseFilename=STAMP_2_0_7 SetupIconFile=..\dist\icons\stamp.ico Compression=lzma SolidCompression=yes