Source code for molsysmt.form.file_bcif_gz.download

import os
import time
import random
from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError
from molsysmt._private.files_and_directories import temp_filename
from molsysmt._private.warnings import warn, DownloadWarning

def _discard_file(path):
    """Best-effort removal of *path*; a missing or undeletable file is ignored."""
    try:
        os.remove(path)
    except OSError:
        pass


def download(pdb_id=None, output_filename=None, tempfile=False, wwPDB_Partner='RCSB PDB',
             retries=5, timeout=30, backoff_base=2.0):
    """
    Download <pdb_id>.bcif.gz from the specified wwPDB partner, retrying on
    rate limiting (HTTP 429), server errors (HTTP 5xx) and transient network
    errors with exponential backoff plus jitter.

    The response is streamed to ``<output_filename>.part`` and only moved to
    ``output_filename`` once the transfer has completed, so a failed download
    never truncates or deletes a file that already existed at that path.

    Parameters
    ----------
    pdb_id : str
        PDB entry identifier. It is used verbatim to build the URL and the
        default output filename.
    output_filename : str or None, default None
        If provided, write to this path. Otherwise it is computed from
        `tempfile`/`pdb_id`.
    tempfile : bool, default False
        If True (and `output_filename` is None), create a temporary filename
        via MolSysMT's temp_filename(). If False and `output_filename` is None,
        the output will be "<pdb_id>.bcif.gz" in the current directory.
    wwPDB_Partner : str, default 'RCSB PDB'
        wwPDB partner source. Currently only 'RCSB PDB' is supported.
    retries : int, default 5
        Maximum number of attempts before failing.
    timeout : int or float, default 30
        Per-request timeout (seconds).
    backoff_base : float, default 2.0
        Exponential backoff base. Retry wait ~ (backoff_base ** attempt) + jitter.

    Returns
    -------
    str
        Path to the downloaded file.

    Raises
    ------
    NotImplementedError
        If the partner is not supported.
    ValueError
        If `pdb_id` is None or empty.
    RuntimeError
        If retries are exhausted, a non-recoverable HTTP error occurs
        (e.g., 404), or an unexpected error interrupts the download.
    """
    if wwPDB_Partner != 'RCSB PDB':
        raise NotImplementedError("Only 'RCSB PDB' is supported at the moment.")

    # Without an ID the URL would silently become ".../None.bcif.gz".
    if not pdb_id:
        raise ValueError("A PDB ID is required to download a .bcif.gz file.")

    # Determine output filename
    if output_filename is None:
        if tempfile:
            output_filename = temp_filename(extension="bcif.gz")
        else:
            output_filename = f"{pdb_id}.bcif.gz"

    url = f"https://models.rcsb.org/{pdb_id}.bcif.gz"

    # Polite User-Agent; some servers prefer/require it
    headers = {"User-Agent": "MolSysMT/1.0 (+https://uibcdf.org) Python-urllib"}

    # Stream into a sibling file so that cleanup after a failure only ever
    # touches our own partial data, never a pre-existing output file.
    partial_filename = f"{output_filename}.part"

    last_err = None

    for attempt in range(retries):
        try:
            req = Request(url, headers=headers)
            with urlopen(req, timeout=timeout) as resp, open(partial_filename, "wb") as fh:
                # Stream to file in chunks to avoid loading the entire response in memory
                while True:
                    chunk = resp.read(1024 * 64)
                    if not chunk:
                        break
                    fh.write(chunk)
            # Promote the finished download to its final name.
            os.replace(partial_filename, output_filename)
            return output_filename  # success

        except HTTPError as e:
            # Must precede the URLError handler: HTTPError is a URLError subclass.
            last_err = e
            _discard_file(partial_filename)
            # Only rate limiting (429) and server errors (5xx) are worth retrying.
            if e.code != 429 and not (500 <= e.code < 600):
                raise RuntimeError(
                    f"Failed to download {pdb_id}.bcif.gz (HTTP {e.code}). URL: {url}"
                ) from e
            problem = (
                f"Download of {pdb_id}.bcif.gz was rate-limited or the server failed "
                f"(HTTP {e.code})."
            )

        except (URLError, TimeoutError, ConnectionError) as e:
            # Transient network error (including time-outs or resets while
            # reading the body): retry with backoff.
            last_err = e
            _discard_file(partial_filename)
            problem = (
                f"Network issue while downloading {pdb_id}.bcif.gz "
                f"({getattr(e, 'reason', e)})."
            )

        except Exception as e:
            # Unexpected error: abort cleanly
            _discard_file(partial_filename)
            raise RuntimeError(
                f"Unexpected error while downloading {pdb_id}.bcif.gz: {e}"
            ) from e

        # Back off only when another attempt will actually follow; waiting
        # after the last attempt would just delay the final error.
        if attempt < retries - 1:
            # Exponential backoff with small jitter
            wait = (backoff_base ** attempt) + random.uniform(0, 0.5)
            warn(f"{problem} Retrying in {wait:.1f}s…", DownloadWarning)
            time.sleep(wait)

    # Retries exhausted
    raise RuntimeError(
        f"Could not download {pdb_id}.bcif.gz after {retries} attempts. Last error: {last_err}"
    )
# Previous urlretrieve-based implementation, kept for reference:
#
# from molsysmt._private.files_and_directories import temp_filename
# from urllib.request import urlretrieve
#
# output = None
#
# if tempfile:
#     output_filename=temp_filename(extension="bcif.gz")
#
# if wwPDB_Partner=='RCSB PDB':
#
#     filename = pdb_id+'.bcif.gz'
#     fullurl = 'https://models.rcsb.org/'+filename
#
#     if output_filename is None:
#         output_filename = filename
#
#     urlretrieve(fullurl, output_filename)
#
#     output = output_filename
#
# else:
#
#     raise NotImplementedError()
#
# return output