Python script that recovers the GenBank IDs of all nucleotide entries linked to a taxon ID.


This Python script recovers the GenBank IDs of all the nucleotide entries linked to a taxon ID. The number of requests is kept to a minimum by paging through the results with the retmax and retstart parameters provided by the Entrez Utilities.
taxid_2_gbids.py
Python

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
#!/usr/bin/env python
 
import xml.etree.ElementTree as ET
import sys, urllib, urllib2
 
# Base URL shared by every E-utilities request in this script. NCBI serves the
# E-utilities over HTTPS, so the scheme must be https:// rather than http://.
eutils_base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
 
def get_ids(taxid):
    """Return the ``Caption`` of every nucleotide record linked to a taxon.

    The nucleotide database is searched page by page (``iteration_step``
    UIDs per ESearch request); each page of UIDs is then resolved with one
    ESummary request and the text of every ``Caption`` item is collected.

    Args:
        taxid: taxonomy identifier, interpolated into the search term
            ``txid<taxid>[Organism:exp]``.

    Returns:
        A list with the text of every ``Caption`` item found, in retrieval
        order.

    Exceptions raised by the ESearch request itself propagate to the caller.
    An unparsable ESearch reply is printed and ends the paging; a failing
    ESummary page is printed and skipped (best effort).
    """
    accession_numbers = []
    retstart = 0
    iteration_step = 10000
    term = "txid%s[Organism:exp]" % taxid

    while True:
        page = esearch(db="nucleotide", term=term,
                       retstart=retstart, retmax=iteration_step)
        try:
            id_list = ET.fromstring(page).find('IdList')
        except Exception as e:
            # Without a readable reply there is no way to know whether more
            # pages exist, so stop instead of requesting offsets forever.
            print(e)
            break

        if id_list is None:
            ids = []
        else:
            ids = [node.text for node in id_list.findall('Id')]

        # An absent *or empty* IdList means the previous page was the last
        # one. Stopping here also avoids an ESummary call with no ids.
        if not ids:
            break

        try:
            summary = ET.fromstring(
                esummary(db="nucleotide", ids=ids, retmax=iteration_step))
            for docsum in summary.findall('DocSum'):
                for item in docsum.findall("Item[@Name='Caption']"):
                    accession_numbers.append(item.text)
        except Exception as e:
            # Best effort: report the failed page and carry on with the next.
            print(e)

        retstart += iteration_step

    return accession_numbers
 
def esearch(db, term, retstart = 0, retmax = 20):
    """Run an ESearch query and return the raw response body.

    Args:
        db: name of the Entrez database to search.
        term: search term, inserted into the query string as given.
        retstart: offset of the first UID to return (formatted with ``%i``).
        retmax: maximum number of UIDs to return (formatted with ``%i``).

    Returns:
        The response body as a string.
    """
    url = "%sesearch.fcgi?db=%s&term=%s&retstart=%i&retmax=%i" % (
        eutils_base_url, db, term, retstart, retmax)
    response = urllib.urlopen(url)
    try:
        return str(response.read())
    finally:
        # Release the connection even when reading the body fails.
        response.close()
 
def esummary(db, ids, retstart = 0, retmax = 20):
    """Fetch the document summaries for a list of UIDs.

    The UIDs are comma-joined and sent, url-encoded, as the request body so
    that long id lists do not have to fit in the URL.

    Args:
        db: name of the Entrez database the UIDs belong to.
        ids: iterable of UID strings.
        retstart: currently unused; it is not sent to the server.
        retmax: currently unused; it is not sent to the server.

    Returns:
        The response body as a string.
    """
    payload = urllib.urlencode({
        'db': db,
        'id': ','.join(ids),
    })
    request = urllib2.Request("%sesummary.fcgi" % eutils_base_url, payload)
    response = urllib2.urlopen(request)
    try:
        return str(response.read())
    finally:
        # Release the connection even when reading the body fails.
        response.close()
 
if __name__ == '__main__':
    # Command line: taxid_2_gbids.py -id <taxid>
    taxid = None

    if "-id" in sys.argv:
        value_pos = sys.argv.index("-id") + 1
        # "-id" given as the very last argument has no value; leave taxid
        # unset so the usage message is shown instead of an IndexError.
        if value_pos < len(sys.argv):
            taxid = sys.argv[value_pos]

    if not taxid:
        print("Usage: taxid_2_gbids.py -id taxid")
        print("Example: taxid_2_gbids.py -id 4754")
        sys.exit(-1)

    ids = get_ids(taxid)
    print(ids)
    print("%i ids found..." % len(ids))