Source code for tolkein.lib.toinsdc
#!/usr/bin/env python3
"""INSDC methods."""
import re
from collections import defaultdict
from urllib.parse import urlencode
import ujson
import xmltodict
from tqdm import tqdm
from .tofetch import fetch_stream
from .tofetch import fetch_url
# Base URL of the ENA Portal API; the query functions in this module append
# the "/count" and "/search" endpoints to it.
PORTAL = "https://www.ebi.ac.uk/ena/portal/api"
def count_taxon_assembly_meta(root):
    """
    Count INSDC assemblies descended from root taxon.

    Builds a ``count`` query against the portal for every assembly in the
    taxonomic subtree rooted at ``root`` and converts the response to an int.

    Args:
        root (int): Root taxon taxid.

    Returns:
        int: Count of assemblies for taxa descended from root.
            Will return None if the request returned nothing.
    """
    query_url = '%s/count?query="tax_tree(%s)"&result=assembly&format=json' % (
        PORTAL,
        str(root),
    )
    response = fetch_url(query_url)
    # A None response signals a failed fetch; pass that through unchanged.
    return None if response is None else int(response)
def fetch_wgs_assembly_meta(root, *, count=-1, offset=0, page=10000):
    """
    Query INSDC WGS assemblies descended from root taxon.

    Pages through the portal ``search`` endpoint (``result=wgs_set``) and
    collects every record into a single dict, renaming the API field names
    to the normalised names used by this module.

    Args:
        root (int): Root taxon taxid.
        count (int): Maximum number of WGS records to fetch.
            Default value (-1) fetches all records.
        offset (int): Offset of first record to fetch. Defaults to 0.
        page (int): Number of records to fetch per API request.
            Defaults to 10000.

    Returns:
        dict: A dict of INSDC WGS assembly metadata keyed on sample accession.
            Records sharing a sample accession overwrite one another, so the
            dict may hold fewer entries than were fetched. Will return None
            if a request came back empty before any record was collected.
    """
    fields = {
        "accession": "wgs_accession",
        "host": "host_scientific_name",
        "first_public": "wgs_first_public",
        "last_updated": "wgs_last_updated",
        "location": "sample_location",
        "study_accession": "study_accession",
        "sample_accession": "sample_accession",
        "sex": "sample_sex",
    }
    options = {
        "fields": ",".join(fields.keys()),
        "format": "json",
        "offset": offset,
        "limit": page,
        "query": '"tax_tree(%d)"' % int(root),
        "result": "wgs_set",
    }
    returned = 0
    wgs_meta = {}
    while returned < count or count == -1:
        if count != -1:
            # Shrink the request so that no more than `count` records are
            # fetched in total, even when `count` is not a multiple of `page`.
            options["limit"] = min(page, count - returned)
        url = "%s/search?%s" % (PORTAL, urlencode(options))
        batch_meta = fetch_url(url)
        entries = ujson.loads(batch_meta) if batch_meta else []
        if not entries:
            # Nothing (more) to fetch. An empty parsed list is treated the
            # same as an empty response so the loop cannot spin forever.
            if not wgs_meta:
                wgs_meta = None
            break
        for entry in entries:
            wgs_meta[entry["sample_accession"]] = {
                fields[key]: value for key, value in entry.items()
            }
        # Advance by the number of records actually received.
        returned += len(entries)
        options["offset"] += len(entries)
    return wgs_meta
def stream_taxon_assembly_meta(root, *, count=-1, offset=0, page=10000):
    """
    Query INSDC assemblies descended from root taxon.

    Pages through the portal ``search`` endpoint (``result=assembly``),
    renames the API field names to normalised names and, where WGS metadata
    exists for the same sample accession, merges it into the record (WGS
    values take precedence on shared keys).

    Args:
        root (int): Root taxon taxid.
        count (int): Maximum number of assemblies to yield.
            Default value (-1) yields all assemblies.
        offset (int): Offset of first assembly to return. Defaults to 0.
        page (int): Number of assemblies to fetch per API request.
            Defaults to 10000.

    Yields:
        dict: Normalised dict of INSDC metadata.
    """
    fields = {
        "accession": "gca_accession",
        "study_accession": "study_accession",
        "sample_accession": "sample_accession",
        "secondary_sample_accession": "secondary_sample_accession",
        "assembly_name": "assembly_name",
        "assembly_title": "assembly_title",
        "study_name": "study_name",
        "study_title": "study_title",
        "study_description": "study_description",
        "tax_id": "taxon_id",
        "scientific_name": "scientific_name",
        "strain": "assembled_strain",
        "base_count": "assembly_span",
        "assembly_level": "assembly_level",
        "genome_representation": "genome_representation",
        "last_updated": "last_updated",
        "version": "assembly_version",
        "assembly_type": "assembly_type",
    }
    options = {
        "fields": ",".join(fields.keys()),
        "format": "json",
        "offset": offset,
        "limit": page,
        "query": '"tax_tree(%d)"' % int(root),
        "result": "assembly",
    }
    # fetch_wgs_assembly_meta returns None when no WGS metadata was fetched;
    # fall back to an empty dict so the membership test below cannot raise.
    wgs_meta = fetch_wgs_assembly_meta(root) or {}
    returned = 0
    while returned < count or count == -1:
        if count != -1:
            # Shrink the request so that no more than `count` assemblies are
            # yielded in total, even when `count` is not a multiple of `page`.
            options["limit"] = min(page, count - returned)
        url = "%s/search?%s" % (PORTAL, urlencode(options))
        batch_meta = fetch_url(url)
        entries = ujson.loads(batch_meta) if batch_meta else []
        if not entries:
            # Empty response or empty result list: nothing more to stream.
            break
        for entry in entries:
            entry_meta = {fields[key]: value for key, value in entry.items()}
            if entry_meta["sample_accession"] in wgs_meta:
                entry_meta = {**entry_meta, **wgs_meta[entry_meta["sample_accession"]]}
            yield entry_meta
        # Advance by the number of records actually received.
        returned += len(entries)
        options["offset"] += len(entries)