"""
Download
--------
"""
import logging
import warnings
import fsspec
import pooch
# Module-level logger used by every function in this file. `download` sets its
# level from the `verbose` argument.
logger = logging.getLogger("fetch_data")
# Installs a global filter at import time that silences every RuntimeWarning,
# including those raised by code outside this module.
warnings.filterwarnings("ignore", category=RuntimeWarning)
def download(
    url="",
    login={},
    dest="./",
    n_jobs=1,
    use_cache=True,
    cache_name="_urls_{hash}.cache",
    verbose=False,
    log_name="_downloads.log",
    decompress=True,
    create_readme=True,
    readme_name="README.md",
    **kwargs,
):
    """Fetch one or many remote files into a local folder.

    The url can be a single address, a single address containing a wildcard
    (``*``) or a list/tuple of addresses. A wildcard url is expanded with
    :code:`get_url_list` (which relies on :code:`fsspec`) and the resulting
    list can be cached on disk. The files themselves are always retrieved
    with :code:`download_urls` (which relies on :code:`pooch`), optionally in
    parallel. A readme and, depending on :code:`verbose`, a log file are
    written to :code:`dest`.

    Args:
        url (str, list): URL/s to be downloaded. A string containing a
            wildcard (*) is expanded by listing the files on the server,
            which is not possible for every HTTP website. A list or tuple is
            used as given (after placeholder replacement) and is never
            cached.
        login (dict): credentials, only needed when the protocol requires
            them. Unpacked as keyword arguments into :code:`get_url_list`
            for wildcard urls and passed on to :code:`download_urls`.
        dest (str): folder where the files will be saved. Placeholders are
            filled from :code:`kwargs` and ``~`` is expanded.
        n_jobs (int): the number of parallel downloads. No progress bar is
            shown when n_jobs > 1. Values above 8 are rejected by
            :code:`download_urls`.
        use_cache (bool): if True, a previously cached url list is used
            instead of listing the server again.
        cache_name (str): name of the url cache file, relative to
            :code:`dest`. Only used for wildcard urls.
        verbose (bool / int): False sets the logging level to 40 (ERROR),
            True sets it to 15, and any other integer is used as the logging
            level directly.
        log_name (str): name of the log file, relative to :code:`dest`. The
            file is only set up when the logging level is below 40.
        decompress (bool): passed on to :code:`download_urls`; if True,
            a processor is chosen per url to unpack compressed downloads.
        create_readme (bool): if True, a readme is written to the
            destination folder before downloading starts.
        readme_name (str): name of the readme file, relative to
            :code:`dest`.
        kwargs (key=value): replacement values for placeholders in the url/s
            and in :code:`dest`.

    Returns:
        list:
            a flattened list of paths of the downloaded files. An empty list
            is returned when no urls were found.

    Raises:
        TypeError: if :code:`verbose` is neither a bool nor an integer.
        ValueError: if :code:`download_urls` returns None.
    """
    from pathlib import Path

    from .utils import commong_substring, flatten_list, get_kwargs, log_to_file

    # Resolve placeholders and the home directory in dest first, so that the
    # resolved value is what gets collected by get_kwargs() below.
    dest = str(Path(dest.format_map(kwargs)).expanduser())

    # Merge the function inputs with the user supplied replacements; the user
    # supplied values win. Keep this call directly in this function body.
    kwargs = {**get_kwargs(), **kwargs}

    # bool is checked by identity first because it is a subclass of int
    if verbose is True:
        level = 15
    elif verbose is False:
        level = 40
    elif isinstance(verbose, int):
        level = verbose
    else:
        raise TypeError("verbose must be bool or intiger")
    logger.setLevel(level)

    # The log file name is stored for the readme even if no file is set up
    log_fname = f"{dest}/{log_name}"
    if level < 40:
        log_to_file(log_fname)
    kwargs["download_logging"] = log_fname

    # The readme is written before any download is attempted
    if create_readme:
        create_download_readme(readme_name, **kwargs)

    if isinstance(url, (list, tuple)):
        # explicit list of urls: no server listing and no caching
        urls = [item.format_map(kwargs) for item in url]
    else:
        # the wildcard is looked for in the url as given, before formatting
        has_wildcard = "*" in url
        filled_url = url.format_map(kwargs)
        if has_wildcard:
            urls = get_url_list(
                url=filled_url,
                use_cache=use_cache,
                cache_path=f"{dest}/{cache_name}",
                **login,  # nothing is passed when login is empty
            )
        else:
            urls = [filled_url]

    common = commong_substring(urls).format_map(kwargs)
    logger.info(f"{len(urls): >3} files at {common}")

    if not urls:
        return []

    logger.info(f"Files will be saved to {dest}")

    flist = download_urls(
        urls,
        n_jobs=n_jobs,
        dest_dir=dest,
        login=login,
        decompress=decompress,
    )
    if flist is None:
        raise ValueError("Files could not be downloaded")

    return flatten_list(flist)
def get_url_list(
    url,
    username=None,
    password=None,
    use_cache=True,
    cache_path="./_urls_{hash}.cache",
    **kwargs,
):
    """Expand a url containing a wildcard (*) into a list of remote files.

    Leverages off the `fsspec` package to list the files on the server. This
    doesn't work for all HTTP urls.

    Parameters:
        url (str): the url to expand. If it contains no wildcard (*), it is
            returned unchanged as a single element list and nothing is
            listed or cached.
        username (str): if required for given url and protocol (e.g. FTP)
        password (str): if required for given url and protocol (e.g. FTP)
        use_cache (bool): if True and the cache file exists, the urls stored
            in it are returned without contacting the server. If True and
            the cache file does not exist, it is written after listing.
        cache_path (str): the path of the cache file, a text file with one
            url per line. `{hash}` is replaced with a hash based on the url.
        kwargs: accepted so that extra login entries can be unpacked into
            this function; not used.

    Returns:
        list: a sorted list of urls

    Raises:
        FileNotFoundError: if listing the url fails with an AttributeError.
        KeyError: if listing the url fails with a TypeError, which is
            reported as the host not accepting username/password.
    """
    # Nothing to expand: avoid touching the cache or the network.
    if "*" not in url:
        return [url]

    from pathlib import Path
    from urllib.parse import urlparse

    if "{hash}" in cache_path:
        from .utils import make_hash_string

        cache_path = cache_path.format(hash=make_hash_string(url))
    cache_file = Path(cache_path)

    if use_cache and cache_file.is_file():
        with open(cache_file, "r") as file:
            # skip blank lines so a trailing newline does not become a url
            cached = [line for line in file.read().splitlines() if line.strip()]
        logger.log(15, f"Fetched {len(cached)} files from flist cache: {cache_file}")
        return sorted(cached)

    purl = urlparse(url)
    protocol = purl.scheme
    host = purl.netloc
    is_http = protocol.startswith("http")

    logger.log(15, f"Fetching filenames from {url}")

    # http(s) file systems take the full url in glob; the others need the host
    props = {"protocol": protocol}
    if not is_http:
        props["host"] = host
    if username is not None:
        props["username"] = username
    if password is not None:
        props["password"] = password

    fs = fsspec.filesystem(**props)

    # purl.path already starts with "/", so no extra separator is added
    pattern = f"{protocol}://{host}{purl.path}" if is_http else purl.path

    try:
        flist = fs.glob(pattern)
    except AttributeError as error:
        raise FileNotFoundError(f"The given url does not exist: {url}") from error
    except TypeError as error:
        raise KeyError(
            f"The host {protocol}://{host} does not accept username/password"
        ) from error

    # Only non-http listings are host-relative and need the prefix. The same
    # condition as for the pattern is used so http urls are not prefixed twice,
    # and the listing is reused rather than fetched a second time.
    if not is_http:
        flist = [f"{protocol}://{host}{f}" for f in flist]

    # writing url list to cache file
    if use_cache:
        cache_file.parent.mkdir(exist_ok=True, parents=True)
        with open(cache_file, "w") as out_file:
            out_file.write("\n".join(flist))
        logger.log(15, f"Cached {len(flist)} urls to: {cache_file}")

    logger.debug(flist)

    return sorted(flist)
def download_urls(
    urls,
    downloader=None,
    n_jobs=8,
    dest_dir=".",
    login=None,
    decompress=True,
    **kwargs,
):
    """
    Downloads the given list of urls to a specified destination path using
    the `pooch` package in Python.

    NOTE: `fsspec` is not used as it fails for some FTP and SFTP protocols.

    A url that cannot be retrieved does not stop the other downloads: it is
    logged, counted as failed and left out of the returned list.

    Args:
        urls (list): the list of URLS to download - may not contain wildcards
        downloader: currently not used; the downloader is chosen for each
            url with `choose_downloader`.
        n_jobs (int): the number of parallel downloads. Reduced to the number
            of urls if larger. Must end up between 1 and 8. A progress bar
            is only requested when `n_jobs` is 1.
        dest_dir (str): the location where the files will be downloaded to.
        login (dict): credentials passed to `choose_downloader` for every url.
            None is treated as no credentials.
        decompress (bool): if True, `choose_processor` picks a processor for
            each url, otherwise no processor is used.
        kwargs (key=value): currently not used.

    Returns:
        list: file names of downloaded urls, as returned by `pooch.retrieve`.
        Empty if `urls` is empty.

    Raises:
        ValueError: if the effective number of jobs is not between 1 and 8.
    """

    def pooch_retrieve_handling(kwargs):
        """
        An internal function to process errors and avoid failed downloads:
        - retries without the progressbar, for servers that do not allow
          the file size to be fetched
        - will detect if permissions are not sufficient for downloading

        Parameters
        ----------
        kwargs: dict
            the keyword arguments for `pooch.retrieve`

        Returns
        -------
        int:
            0 = success
            1 = failure
        str:
            retrieved filename, if failed returns the URL
        """
        from requests import HTTPError

        pooch.get_logger().setLevel(1000)

        url = kwargs.get("url")

        # KeyboardInterrupt is not an Exception subclass, so it propagates
        # out of both attempts without being swallowed.
        try:
            logger.log(15, f"retrieving {url}")
            return 0, pooch.retrieve(**kwargs)
        except HTTPError:
            logger.log(20, f"ERROR: URL not found: {url}. ")
            return 1, url
        except Exception as error:
            logger.log(15, f"retrying {url} without progressbar after: {error}")

        try:
            # this is for when the server does not allow the file size to be fetched
            kwargs["downloader"].progressbar = False
            return 0, pooch.retrieve(**kwargs)
        except Exception as error:
            # report useful information to the logger instead of raising
            if "550" in str(error):
                logger.log(20, f"ERROR: Check file permissions: {url}. ")
            else:
                logger.log(20, f"ERROR: Could not retrieve {url}: {error}")
            return 1, url

    urls = list(urls)
    if not urls:
        return []

    # never use more jobs than there are urls
    n_workers = min(n_jobs, len(urls))
    if not 1 <= n_workers <= 8:
        # max set to 8 for safety - sometimes too many connections
        raise ValueError("n_jobs must be between 1 and 8 to avoid too many requests")

    login = {} if login is None else login

    # a real bool, so the downloader never receives an int as progressbar
    show_progress = n_jobs == 1 and logging.getLogger("fetch_data").level <= 20

    # one downloader per url, since a retry changes the downloader's state
    download_args = [
        dict(
            url=url,
            known_hash=None,
            fname=url.split("/")[-1],
            path=dest_dir,
            processor=choose_processor(url) if decompress else None,
            downloader=choose_downloader(url, login=login, progress=show_progress),
        )
        for url in urls
    ]

    if n_workers == 1:  # will not use joblib for a single job
        results = [pooch_retrieve_handling(d) for d in download_args]
    else:  # uses joblib for parallel downloads
        from joblib import Parallel, delayed

        results = Parallel(n_jobs=n_workers, prefer="threads")(
            delayed(pooch_retrieve_handling)(d) for d in download_args
        )

    failed = [f for o, f in results if o > 0]
    passed = [f for o, f in results if o == 0]

    logger.info(
        f"SUMMARY: Retrieved={len(passed)}, Failed={len(failed)} listing failed below: \n"
        + "\n".join(failed)
    )

    return passed
def choose_downloader(url, login=None, progress=True):
    """
    Will automatically select the correct downloader for the given url. Pass
    result to pooch.retrieve(downloader=downloader)

    Args:
        url (str): the path of a url. The protocol must be ftp, http or https.
        login (dict): can contain either `username` and `password` OR `cookies`
            which are passed to the relevant downloader in pooch. For http(s)
            urls, `username` and `password` are passed as `auth`. None or an
            empty dict means no credentials. The dict is not modified.
        progress (bool): a progressbar will be shown if True - requires tqdm

    Returns:
        pooch.Downloader: with the items in login passed to the downloader
            as kwargs and progressbar set to `progress`

    Raises:
        ValueError: if the protocol of the url is not supported.
        KeyError: if the url is http(s) and `login` is not empty but contains
            neither `cookies` nor both `username` and `password`.
    """
    from urllib.parse import urlparse

    # protocol -> name of the downloader class in pooch; names are used so the
    # protocol can be validated before pooch is needed
    known_downloaders = {
        "ftp": "FTPDownloader",
        "http": "HTTPDownloader",
        "https": "HTTPDownloader",
    }

    scheme = urlparse(url).scheme
    if scheme not in known_downloaders:
        raise ValueError(
            f"Unrecognized URL protocol '{scheme}' in '{url}'. "
            f"Must be one of {known_downloaders.keys()}."
        )

    import pooch

    downloader_class = getattr(pooch, known_downloaders[scheme])

    # work on a copy so that the caller's dict is never shared or changed
    options = dict(login) if login else {}

    # if http, then use different password implementation
    if scheme.startswith("http") and options:
        if "cookies" in options:
            options = dict(cookies=options["cookies"])
        elif "username" in options and "password" in options:
            options = dict(auth=(options["username"], options["password"]))
        else:
            raise KeyError("`login` can only contain (username, password) OR cookies")

    return downloader_class(progressbar=progress, **options)
def choose_processor(url):
    """
    Chooses the pooch processor to uncompress a download, if required.

    The choice is based on the extension at the end of the url path (or at
    the end of the whole url, for file names given in a query string). An
    extension that only appears in the host name or in a folder name does
    not select a processor.

    Args:
        url (str): the url of the file that will be downloaded.

    Returns:
        pooch.Unzip for `.zip`, pooch.Untar for `.tar`, `.tgz` and `.tar.gz`,
        pooch.Decompress for `.gz` and `.gz2`, otherwise None. None is also
        returned for any url containing a wildcard (*).
    """
    from urllib.parse import urlparse

    lowered = url.lower()

    # a url with a wildcard never gets a processor
    if "*" in lowered:
        return None

    # the path drops any query string or fragment after the file name
    candidates = (urlparse(lowered).path, lowered)

    def has_suffix(extensions):
        return any(name.endswith(extensions) for name in candidates)

    if has_suffix((".zip",)):
        return pooch.Unzip()
    # tar is checked before gz so that `.tar.gz` is untarred
    if has_suffix((".tar", ".tgz", ".tar.gz")):
        return pooch.Untar()
    if has_suffix((".gz2", ".gz")):
        return pooch.Decompress()

    return None
def create_download_readme(fname, **entry):
    """
    Creates a README file based on the information in the entry dictionary.
    An existing file with the same name is overwritten.

    Parameters
    ----------
    fname: str
        name to which the file will be written, relative to `dest`
    **entry: kwargs
        the entries that are read are:
        - dest: folder in which the readme is written; the current folder
          is used if missing or empty
        - url: a string, or a list/tuple of strings that is shortened to its
          common substring followed by "..."; treated as empty if missing
        - name: passed to `make_readme_file`, "" if missing
        - meta: passed to `make_readme_file`, {} if missing
    """
    from pathlib import Path

    from .utils import commong_substring, make_readme_file

    # a missing dest must not end up as a folder literally named "None"
    dest = entry.get("dest") or "."

    # readme will always be overwritten
    readme_path = Path(f"{dest}/{fname}")
    readme_path.parent.mkdir(parents=True, exist_ok=True)

    url = entry.get("url")
    if url is None:
        # same value as the default url of `download`; avoids len(None) below
        url = ""
    elif isinstance(url, (list, tuple)):
        url = commong_substring(url) + "..."

    readme_text = make_readme_file(
        entry.get("name", ""),
        url,
        entry.get("meta", {}),
        short_info_len_limit=max(120, len(url)),
    )

    with open(readme_path, "w", encoding="utf-8") as file:
        file.write(readme_text)