Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handling of incertae sedis output from genbank #174

Draft
wants to merge 4 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/data_providers/data_format/ncbi_taxa.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,14 @@ if you want to use `Crbe` in your data worksheets, rather than typing out
should be placed in the correct rank columns, and validation is carried out to check
that the rank implied by the notation matches the column rank.

When intermediate ranks are unknown they can be left blank. However, as some common
taxonomic resolution packages label these ranks as _Incertae sedis_, we also support
this. These ranks will be ignored in the validation process, unless they are the highest
or lowest rank provided for the taxon, in which case the validation will fail. At
present, we only check for _Incertae sedis_ output matching that of the DADA2
package. This pattern is as follows:
`{first_letter_of_rank}__{last_certain_parent_name}_{abbreviated_rank}_Incertae_sedis`,
where the abbreviated rank is `cls`, `ord` or `fam` (for class, order and family
respectively).

Two special cases are that NCBI outputs typically separate out the components of
binomial and trinomial names: for example, they might return `g__Escherichia` and
`s__coli`. In order to be able to match _complete_ species and subspecies names
Expand Down
96 changes: 94 additions & 2 deletions safedata_validator/taxa.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@
# NCBI name regex: matches a single lowercase letter followed by a double underscore
# at the start of a name (e.g. the "k__" or "g__" rank prefix)
NCBI_prefix_re = re.compile("^[a-z]__")

# Mapping of rank names onto the shortened versions used in Incertae sedis naming
NCBI_incertae_sedis_mapping = {"class": "cls", "order": "ord", "family": "fam"}


class GBIFError(Exception):
"""Exception class for GBIF errors.
Expand Down Expand Up @@ -278,13 +281,13 @@ def __post_init__(self):

if self.ncbi_id != leaf_ncbi_id:
raise ValueError(
f"Provided NCBI ID ({self.ncbi_id}) not does not match "
f"Provided NCBI ID ({self.ncbi_id}) does not match "
f"first ID in hierarchy ({leaf_ncbi_id})"
)

if self.parent_ncbi_id != parent_ncbi_id:
raise ValueError(
f"Provided parent NCBI ID ({self.parent_ncbi_id}) not does not match "
f"Provided parent NCBI ID ({self.parent_ncbi_id}) does not match "
f"first parent ID in hierarchy ({parent_ncbi_id})"
)

Expand Down Expand Up @@ -1717,12 +1720,23 @@ def load(self, worksheet: worksheet) -> None:
# - Tackle in taxonomic order by iterating over ordered found_ranks
# - Drop empty entries
# - Validate non-empty entries as unpadded strings
# - Check if any names indicate that the rank in question is Incertae_sedis
# - Strip any NCBI k__ notation to match entries in names.names_txt db
# field. Runs from root, so cleans genus, species, subspecies.

taxon_dict: dict[str, str] = {}
validate = True

# Find lowest rank with a value
rank_data = [row[key] for key in found_ranks]
ranks_with_names = [
index for index, element in enumerate(rank_data) if element is not None
]
if ranks_with_names:
lowest_rank = found_ranks[ranks_with_names[-1]]
else:
lowest_rank = found_ranks[-1]

for rnk in found_ranks:
# Get the name value associated with the rank
value = row[rnk]
Expand All @@ -1746,6 +1760,31 @@ def load(self, worksheet: worksheet) -> None:
LOGGER.error(f"Rank {rnk} has whitespace padding: {value!r}")
value = value_stripped

# Check whether the taxon is incertae sedis (and should be ignored for
# validation purposes)
try:
# Find last certain taxon in tree based on what was last added to
# the dictionary, if nothing has been added it should be left blank
if taxon_dict:
certain_parent = taxon_dict[list(taxon_dict.keys())[-1]]
else:
certain_parent = ""
incertae_sedis = check_incertae_sedis(
name=value,
rank=rnk,
last_certain_parent=certain_parent,
)
if incertae_sedis:
if rnk is not lowest_rank:
continue
else:
LOGGER.error(
f"Incertae sedis provided for lowest taxon rank {rnk}!"
)
validate = False
except ValueError:
validate = False

# Strip k__ notation to provide clean name_txt search input - dropping
# levels where no taxonomic information is associated with the annotation
# (s__ etc. entries)
Expand Down Expand Up @@ -2325,3 +2364,56 @@ def construct_bi_or_tri(higher_nm: str, lower_nm: str, tri: bool) -> str:
raise ValueError(msg)

return value


def check_incertae_sedis(name: str, rank: str, last_certain_parent: str) -> bool:
    """Check if a rank name indicates that the taxon is Incertae sedis.

    These ranks are not sensible validation targets, so this function identifies them so
    they can be excluded from validation.

    At the moment, only a single pattern is checked. This function may have to be
    extended in future to catch other patterns which indicate Incertae sedis taxa.

    The following pattern is checked for, where the abbreviated rank is looked up in
    ``NCBI_incertae_sedis_mapping``:
    {first_letter_of_rank}__{last_certain_parent_name}_{abbreviated_rank}_Incertae_sedis

    Args:
        name: The name of the taxon
        rank: The rank of the taxon. Matching against the accepted ranks is case
            insensitive.
        last_certain_parent: The name of the lowest ranked taxon which is definitely
            parent of this taxon

    Raises:
        ValueError: If the provided name ends with Incertae_sedis but either the rank
            isn't one for which we expect Incertae sedis taxa, or the name doesn't
            follow the expected pattern.

    Returns:
        A bool indicating whether the taxon is incertae sedis.
    """

    # Names without the suffix are ordinary taxa and need no further checks
    if not name.endswith("Incertae_sedis"):
        return False

    # Normalise the rank once so that the membership test and the lookup below agree;
    # looking up the raw rank would raise KeyError for e.g. "Class".
    rank_key = rank.lower()

    # If this isn't a rank we accept incertae sedis for, raise an error
    if rank_key not in NCBI_incertae_sedis_mapping:
        msg = f"The rank {rank} of taxon {name} doesn't allow Incertae sedis taxa!"
        LOGGER.error(msg)
        raise ValueError(msg)

    # Construct the expected name and check whether it matches
    expected_name = (
        f"{rank_key[0]}__{last_certain_parent}_"
        f"{NCBI_incertae_sedis_mapping[rank_key]}_Incertae_sedis"
    )
    if name != expected_name:
        msg = (
            f"Taxon {name} possibly Incertae sedis but doesn't have correct "
            "pattern!"
        )
        LOGGER.error(msg)
        raise ValueError(msg)

    return True
Binary file modified test/fixtures/ncbi_database_truncated.sqlite
Binary file not shown.
3 changes: 2 additions & 1 deletion test/fixtures/test_ncbi_taxa_details.csv
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,5 @@ Lemonia,119278
P chloro,333
Ascomycota,4890
Eurotiomycetes,147545
Dinophyceae,2864
Dinophyceae,2864
Acremonium,159075
Loading
Loading