Source code for molsysmt.form.file_bcif_gz.download
import os
import time
import random
from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError
from molsysmt._private.files_and_directories import temp_filename
from molsysmt._private.warnings import warn, DownloadWarning
[docs]
def _discard_partial(path):
    """Remove a partially written download; a missing file or failed removal is ignored."""
    try:
        os.remove(path)
    except OSError:
        pass


def download(pdb_id=None, output_filename=None, tempfile=False, wwPDB_Partner='RCSB PDB',
             retries=5, timeout=30, backoff_base=2.0):
    """
    Download <pdb_id>.bcif.gz from the specified wwPDB partner, handling rate limiting (HTTP 429)
    and transient network errors with retries and exponential backoff + jitter.

    The payload is streamed into a sibling "<output_filename>.part" file and only moved over
    `output_filename` once the transfer has completed, so a failed download never truncates
    or deletes a file that already existed at `output_filename`.

    Parameters
    ----------
    pdb_id : str
        PDB ID. It is interpolated verbatim into the request URL and into the default
        output filename.
    output_filename : str or None, default None
        If provided, write to this path. Otherwise it is computed from `tempfile`/`pdb_id`.
    tempfile : bool, default False
        If True (and `output_filename` is None), create a temporary filename via MolSysMT's
        temp_filename(). If False and `output_filename` is None, the output will be
        "<pdb_id>.bcif.gz" in the current directory.
    wwPDB_Partner : str, default 'RCSB PDB'
        wwPDB partner source. Currently only 'RCSB PDB' is supported.
    retries : int, default 5
        Maximum number of attempts before failing.
    timeout : int or float, default 30
        Per-request timeout (seconds).
    backoff_base : float, default 2.0
        Exponential backoff base. Retry wait ~ (backoff_base ** attempt) + jitter.

    Returns
    -------
    str
        Path to the downloaded file (`output_filename` as given or as computed).

    Raises
    ------
    NotImplementedError
        If the partner is not supported.
    RuntimeError
        If retries are exhausted, a non-recoverable HTTP error occurs (e.g., 404), or an
        unexpected error interrupts the download.
    """
    if wwPDB_Partner != 'RCSB PDB':
        raise NotImplementedError("Only 'RCSB PDB' is supported at the moment.")

    # Determine output filename
    if output_filename is None:
        if tempfile:
            output_filename = temp_filename(extension="bcif.gz")
        else:
            output_filename = f"{pdb_id}.bcif.gz"

    # Stream into a sibling ".part" file so that the final path is only touched on success.
    target = os.fspath(output_filename)
    partial_filename = target + (b".part" if isinstance(target, bytes) else ".part")

    url = f"https://models.rcsb.org/{pdb_id}.bcif.gz"
    # Polite User-Agent; some servers prefer/require it
    headers = {"User-Agent": "MolSysMT/1.0 (+https://uibcdf.org) Python-urllib"}
    chunk_size = 1024 * 64

    last_err = None
    for attempt in range(retries):
        try:
            req = Request(url, headers=headers)
            with urlopen(req, timeout=timeout) as resp, open(partial_filename, "wb") as fh:
                # Stream to file in chunks to avoid loading the entire response in memory
                for chunk in iter(lambda: resp.read(chunk_size), b""):
                    fh.write(chunk)
            # Both handles are closed here; promote the finished file in one step.
            os.replace(partial_filename, output_filename)
            return output_filename  # success
        except HTTPError as e:
            _discard_partial(partial_filename)
            # Only rate limiting (429) and server errors (5xx) are worth retrying.
            if e.code != 429 and not (500 <= e.code < 600):
                # Non-recoverable HTTP error (e.g., 400/404)
                raise RuntimeError(
                    f"Failed to download {pdb_id}.bcif.gz (HTTP {e.code}). URL: {url}"
                ) from e
            last_err = e
            problem = (
                f"Download of {pdb_id}.bcif.gz was rate-limited or the server failed "
                f"(HTTP {e.code})."
            )
        except (URLError, TimeoutError, ConnectionError) as e:
            # Transient network error (connection failure, timeout or reset while reading
            # the body): retry with backoff.
            _discard_partial(partial_filename)
            last_err = e
            problem = (
                f"Network issue while downloading {pdb_id}.bcif.gz "
                f"({getattr(e, 'reason', e)})."
            )
        except Exception as e:
            # Unexpected error: abort cleanly
            _discard_partial(partial_filename)
            raise RuntimeError(
                f"Unexpected error while downloading {pdb_id}.bcif.gz: {e}"
            ) from e

        # No further attempt follows the last one, so do not announce a retry or sleep.
        if attempt == retries - 1:
            break

        # Exponential backoff with small jitter
        wait = (backoff_base ** attempt) + random.uniform(0, 0.5)
        warn(f"{problem} Retrying in {wait:.1f}s…", DownloadWarning)
        time.sleep(wait)

    # Retries exhausted
    raise RuntimeError(
        f"Could not download {pdb_id}.bcif.gz after {retries} attempts. Last error: {last_err}"
    ) from last_err
# from molsysmt._private.files_and_directories import temp_filename
# from urllib.request import urlretrieve
#
# output = None
#
# if tempfile:
# output_filename=temp_filename(extension="bcif.gz")
#
# if wwPDB_Partner=='RCSB PDB':
#
# filename = pdb_id+'.bcif.gz'
# fullurl = 'https://models.rcsb.org/'+filename
#
# if output_filename is None:
# output_filename = filename
#
# urlretrieve(fullurl, output_filename)
#
# output = output_filename
#
# else:
#
# raise NotImplementedError()
#
# return output