#!/usr/bin/env python3
"""Taxonomy methods."""
import re
from .tofile import open_file_handle
def parse_ncbi_nodes_dmp(path):
    """Read an NCBI ``nodes.dmp`` file into a dict keyed by taxon ID.

    Each line is split on ``|`` (with surrounding whitespace); the first
    three fields are taken as taxon ID, parent taxon ID and rank, and any
    remaining fields are ignored.

    Args:
        path: Location of the nodes file, passed to ``open_file_handle``.

    Returns:
        dict: ``{taxon_id: {"parent", "taxon_rank", "taxon_names"}}`` with
        ``taxon_names`` initialised to an empty list.  The entry for ``"1"``
        is always replaced by a rank-``"no rank"`` record without a
        ``"parent"`` key.
    """
    field_sep = re.compile(r"\s*\|\s*")
    nodes = {}
    with open_file_handle(path) as handle:
        for row in handle:
            taxon_id, parent, taxon_rank, *_rest = field_sep.split(row)
            nodes[taxon_id] = dict(
                parent=parent,
                taxon_rank=taxon_rank,
                taxon_names=[],
            )
    # Taxon "1" is stored without a "parent" key, whatever the file said.
    # NOTE(review): presumably because the NCBI root lists itself as parent.
    nodes["1"] = dict(taxon_rank="no rank", taxon_names=[])
    return nodes
def parse_ncbi_names_dmp(path, nodes):
    """Read an NCBI ``names.dmp`` file and attach the names to ``nodes``.

    Lines whose taxon ID is not a key of ``nodes`` are skipped.  For every
    other line the name is appended to that node's ``taxon_names`` list; a
    name of class ``"scientific name"`` additionally sets the node's
    ``taxon_id`` and ``scientific_name`` entries.

    Args:
        path: Location of the names file, passed to ``open_file_handle``.
        nodes (dict): Node records keyed by taxon ID; modified in place.
    """
    field_sep = re.compile(r"\s*\|\s*")
    with open_file_handle(path) as handle:
        for row in handle:
            taxon_id, name, _unique, name_class, *_rest = field_sep.split(row)
            if taxon_id not in nodes:
                continue
            node = nodes[taxon_id]
            if name_class == "scientific name":
                node["taxon_id"] = taxon_id
                node["scientific_name"] = name
            node["taxon_names"].append({"name": name, "class": name_class})
def _lineage_entry(node, depth):
    """Build one lineage record for ``node`` at ``depth`` steps from the start."""
    return {
        "taxon_id": node["taxon_id"],
        "taxon_rank": node["taxon_rank"],
        "scientific_name": node["scientific_name"],
        "node_depth": depth,
    }


def stream_nodes(nodes, roots):
    """Add lineage info and stream taxonomy nodes.

    For every node, follow its ``parent`` links through ``nodes`` and
    collect one lineage record per step (the node itself is depth 0).  The
    walk stops when a node has no ``parent`` key, when the parent is not in
    ``nodes``, when an ancestor listed in ``roots`` has been reached, or
    when the parent has already been visited on this walk.  The last case
    guards against cyclic parent links (e.g. a root that names itself as
    parent), which would otherwise loop forever.

    Only nodes that are listed in ``roots`` or have an ancestor listed in
    ``roots`` are yielded.

    Args:
        nodes (dict): Node records keyed by taxon ID.  Every visited record
            must provide ``taxon_id``, ``taxon_rank`` and ``scientific_name``.
        roots: Container of taxon IDs that delimit the subtrees to keep.

    Yields:
        tuple: ``("taxon_id-<id>", doc)`` where ``doc`` is a shallow copy of
        the node record with an added ``"lineage"`` list.

    Raises:
        KeyError: If a visited node lacks ``taxon_id``, ``taxon_rank`` or
            ``scientific_name``.
    """
    for taxon_id, node in nodes.items():
        doc = node.copy()
        doc["lineage"] = [_lineage_entry(node, 0)]
        descendant = taxon_id in roots
        # IDs already on this walk; revisiting one means the links are cyclic.
        visited = {taxon_id}
        current = node
        depth = 0
        while "parent" in current:
            parent_id = current["parent"]
            if parent_id not in nodes or parent_id in visited:
                break
            visited.add(parent_id)
            depth += 1
            current = nodes[parent_id]
            doc["lineage"].append(_lineage_entry(current, depth))
            if current["taxon_id"] in roots:
                # Lineages are truncated at the first root encountered.
                descendant = True
                break
        if descendant:
            yield "taxon_id-%s" % taxon_id, doc
def parse_ncbi_taxdump(path, root=None):
    """Stream lineage-expanded taxa from an NCBI taxdump directory.

    Reads ``nodes.dmp`` and ``names.dmp`` under ``path`` and yields the
    entries produced by ``stream_nodes`` for the requested root(s).

    Args:
        path: Directory containing ``nodes.dmp`` and ``names.dmp``.
        root: A single taxon ID or a list of taxon IDs; each is converted
            with ``str``.  Defaults to ``["1"]`` when ``None``.

    Yields:
        Whatever ``stream_nodes`` yields for the parsed nodes.
    """
    if root is None:
        roots = ["1"]
    elif isinstance(root, list):
        roots = [str(item) for item in root]
    else:
        roots = [str(root)]
    nodes_file = "%s/nodes.dmp" % path
    names_file = "%s/names.dmp" % path
    nodes = parse_ncbi_nodes_dmp(nodes_file)
    parse_ncbi_names_dmp(names_file, nodes)
    yield from stream_nodes(nodes, roots)
# Known cross-reference prefixes mapped to their display name and URL stub.
# Kept at module level so the table is built once rather than on every call.
_XREF_DBS = {
    "gbif": {"source": "GBIF", "stub": "https://www.gbif.org/species/"},
    "irmng": {
        "source": "IRMNG",
        "stub": "https://www.irmng.org/aphia.php?p=taxdetails&id=",
    },
    "ott": {
        "source": "OTT",
        "stub": "https://tree.opentreeoflife.org/opentree/argus/ottol@",
    },
    "ncbi": {
        "source": "NCBI",
        "stub": "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=",
    },
    "silva": {
        "source": "SILVA",
        "stub": "https://www.arb-silva.de/browser/ssu-138.1/",
    },
    "worms": {
        "source": "WoRMS",
        "stub": "http://www.marinespecies.org/aphia.php?p=taxdetails&id=",
    },
}


def add_xrefs(names, xrefs):
    """Add xrefs to a list of taxon names.

    Each cross-reference is expected to look like ``"<prefix>:<accession>"``.
    It is split at the first colon only, so accessions that themselves
    contain a colon are kept intact.  Entries without a colon, or with a
    prefix that is not in the known-database table, are skipped rather than
    raising.

    Args:
        names (list): Name records to extend; modified in place.
        xrefs: Iterable of cross-reference strings.

    Returns:
        None.  For every recognised cross-reference a dict with keys
        ``name``, ``class`` (always ``"xref"``), ``source`` and
        ``source_url_stub`` is appended to ``names``.
    """
    for xref in xrefs:
        source, colon, accession = xref.partition(":")
        if not colon:
            # No "<prefix>:" part, so there is nothing to look up.
            continue
        db = _XREF_DBS.get(source)
        if db is None:
            continue
        names.append(
            {
                "name": accession,
                "class": "xref",
                "source": db["source"],
                "source_url_stub": db["stub"],
            }
        )
def parse_ott_nodes_dmp(path):
    """Read an Open Tree of Life ``taxonomy.tsv`` file into a node dict.

    Each line is split on ``|`` (with surrounding whitespace) into uid,
    parent uid, name, rank and source info; further fields are ignored and
    the header line (uid field equal to ``"uid"``) is skipped.  Node keys
    and parent references are the uid prefixed with ``"ott_"``.

    Args:
        path: Location of the taxonomy file, passed to ``open_file_handle``.

    Returns:
        dict: ``{"ott_<uid>": record}`` where each record has ``parent``,
        ``taxon_rank`` and ``taxon_names``.  When the name field is
        non-empty the record also has ``taxon_id`` and ``scientific_name``
        and the name is listed in ``taxon_names``.  Cross-references for the
        OTT uid and for each comma-separated source-info item (with any
        ``#<digits>`` removed) are added through ``add_xrefs``.
    """
    field_sep = re.compile(r"\s*\|\s*")
    hash_number = re.compile(r"#\d+")
    nodes = {}
    with open_file_handle(path) as handle:
        for row in handle:
            uid, parent, taxon_name, taxon_rank, source_info, *_rest = (
                field_sep.split(row)
            )
            if uid == "uid":
                continue
            taxon_id = "ott_%s" % uid
            taxon_names = []
            node = {
                "parent": "ott_%s" % parent,
                "taxon_rank": taxon_rank,
                "taxon_names": taxon_names,
            }
            nodes[taxon_id] = node
            if taxon_name:
                node["taxon_id"] = taxon_id
                node["scientific_name"] = taxon_name
                taxon_names.append(
                    {"name": taxon_name, "class": "scientific_name"}
                )
            # The OTT uid itself is always the first cross-reference.
            xrefs = ["ott:%s" % uid]
            if source_info:
                xrefs.extend(
                    hash_number.sub("", item) for item in source_info.split(",")
                )
            add_xrefs(taxon_names, xrefs)
    return nodes
def parse_ott_names_dmp(path, nodes):
    """Read an Open Tree of Life ``synonyms.tsv`` file and extend ``nodes``.

    Each line is split on ``|`` (with surrounding whitespace) into name, uid
    and name class; the header line (uid field equal to ``"uid"``) is
    skipped.  When ``"ott_<uid>"`` is a key of ``nodes`` the name is appended
    to that node's ``taxon_names`` list.

    Args:
        path: Location of the synonyms file, passed to ``open_file_handle``.
        nodes (dict): Node records keyed by ``"ott_<uid>"``; modified in
            place.
    """
    field_sep = re.compile(r"\s*\|\s*")
    with open_file_handle(path) as handle:
        for row in handle:
            name, uid, name_class, *_rest = field_sep.split(row)
            if uid == "uid":
                continue
            key = "ott_%s" % uid
            if key not in nodes:
                continue
            nodes[key]["taxon_names"].append({"name": name, "class": name_class})
def parse_ott_taxdump(path, root=None):
    """Stream lineage-expanded taxa from an Open Tree of Life taxonomy dump.

    Reads ``taxonomy.tsv`` and ``synonyms.tsv`` under ``path`` and yields the
    entries produced by ``stream_nodes`` for the requested root(s).

    Args:
        path: Directory containing ``taxonomy.tsv`` and ``synonyms.tsv``.
        root: A single taxon ID or a list of taxon IDs; each is converted
            with ``str``.  Defaults to ``["ott_805080"]`` when ``None``.

    Yields:
        Whatever ``stream_nodes`` yields for the parsed nodes.
    """
    if root is None:
        roots = ["ott_805080"]
    elif isinstance(root, list):
        roots = [str(item) for item in root]
    else:
        roots = [str(root)]
    taxonomy_file = "%s/taxonomy.tsv" % path
    synonyms_file = "%s/synonyms.tsv" % path
    nodes = parse_ott_nodes_dmp(taxonomy_file)
    parse_ott_names_dmp(synonyms_file, nodes)
    yield from stream_nodes(nodes, roots)
def parse_taxonomy(taxonomy_type, path, root=None):
    """Dispatch to the parser for the named taxonomy.

    Args:
        taxonomy_type (str): ``"ncbi"`` or ``"ott"``, matched
            case-insensitively.
        path: Directory holding the taxonomy dump files.
        root: Optional root taxon ID(s), forwarded to the parser.

    Returns:
        The generator returned by ``parse_ncbi_taxdump`` or
        ``parse_ott_taxdump``, or ``None`` when ``taxonomy_type`` is not
        recognised.
    """
    kind = taxonomy_type.lower()
    if kind == "ncbi":
        return parse_ncbi_taxdump(path, root)
    if kind == "ott":
        return parse_ott_taxdump(path, root)
    return None