Skip to content

Instantly share code, notes, and snippets.

@thorwhalen
Created November 8, 2024 10:46
Show Gist options
  • Select an option

  • Save thorwhalen/e8fe6c0454ab2109d4713f886b38bbda to your computer and use it in GitHub Desktop.

Select an option

Save thorwhalen/e8fe6c0454ab2109d4713f886b38bbda to your computer and use it in GitHub Desktop.
A simple but flexible content acquisition (python) function
# Note: the only two third-party packages you need are:
# pip install dol
# pip install requests
import os
import requests
from typing import Dict, Union, MutableMapping, KT, VT, TypeVar, Callable, Any
from functools import partial
# Default store location: the DFLT_DOL_DOWNLOAD_DIR environment variable when it
# is set, otherwise '~/Downloads'. The '~' is not expanded here; ensure_store_func
# applies os.path.expanduser to string stores.
DFLT_STORE_DIR = os.environ.get('DFLT_DOL_DOWNLOAD_DIR', '~/Downloads')
# URI is an alias of the VT name imported from typing above.
URI = VT
# A directory path, represented as a plain str.
Dirpath = str
# Type variable standing for whatever uri_to_content returns.
ContentType = TypeVar('ContentType')
# Shape of a store function: called as func(key, content) and returning None.
StoreFunc = Callable[[KT, ContentType], None]
def is_not_none(x):
    """Return True unless ``x`` is the ``None`` singleton.

    Falsy values other than ``None`` (``0``, ``''``, ``[]``, ...) yield True.
    """
    return not (x is None)
def acquire_content(
    uri_to_content: Callable[[URI], ContentType],
    uris: Union[Dict[KT, URI], None] = None,
    store: Union[Dirpath, MutableMapping, StoreFunc] = DFLT_STORE_DIR,
    *,
    save_condition: Callable[[Any], bool] = is_not_none,
):
    """
    Acquire content from a set of URIs and store it.

    For every ``(key, uri)`` pair in ``uris``, ``uri_to_content(uri)`` is called
    and, if ``save_condition(content)`` is true, the content is stored under
    ``key``.

    :param uri_to_content: Callable taking a URI and returning its content.
        Typically something that reads a file (``open(filepath).read()``) or
        fetches a URL (``requests.get(url).content``).
    :param uris: Mapping from storage key to URI. If ``None``, nothing is
        acquired; instead a function taking ``uris`` is returned (see below).
    :param store: Where content goes: a directory path, a ``MutableMapping``,
        or a function called as ``store(key, content)``. It is resolved with
        ``ensure_store_func``.
    :param save_condition: Predicate applied to each acquired content; content
        is stored only when it returns a true value. By default, content that
        is ``None`` is skipped.
    :return: ``None`` when ``uris`` is given. When ``uris`` is ``None``, a
        ``functools.partial`` of this function with ``uri_to_content``,
        ``store`` and ``save_condition`` already bound.
    :raises TypeError: If ``uris`` is ``None`` and ``uri_to_content`` is not
        callable.
    :raises ValueError: If ``store`` cannot be resolved (see
        ``ensure_store_func``).

    Typical ``uri_to_content`` functions:

    >>> from pathlib import Path
    >>> files_uri_to_content = lambda filepath: Path(filepath).read_text()
    >>> urls_uri_to_content = lambda url: requests.get(url).content  # doctest: +SKIP

    Here, we use a simple ``str.upper`` so the tests do no actual IO, and a dict
    as the store. Usually, though, you'll want a directory, a MutableMapping,
    or a function that stores content in a specific way.

    >>> store = {}
    >>> uris = {'example1': 'hello', 'example2': 'world'}
    >>> acquire_content(str.upper, uris, store)
    >>> store
    {'example1': 'HELLO', 'example2': 'WORLD'}

    Often you want to fix ``uri_to_content`` (and sometimes ``store``) once.
    ``acquire_content`` acts as a function factory for that: if you don't
    specify ``uris``, you get a function that takes ``uris`` as its first
    argument and stores the content acquired from them.

    >>> content_acquirer = acquire_content(str.upper, store=store)
    >>> content_acquirer({'example3': 'foo', 'example4': 'bar'})
    >>> store
    {'example1': 'HELLO', 'example2': 'WORLD', 'example3': 'FOO', 'example4': 'BAR'}

    A custom ``save_condition`` is carried over to the returned function:

    >>> short_only = {}
    >>> acquirer = acquire_content(
    ...     str.upper, store=short_only, save_condition=lambda c: len(c) < 4
    ... )
    >>> acquirer({'a': 'foo', 'b': 'foobar'})
    >>> short_only
    {'a': 'FOO'}
    """
    # Resolve the store first so an unusable store is reported immediately,
    # including when we are only building the parametrized function.
    store = ensure_store_func(store)

    if uris is None:
        # Factory mode: bind everything except uris.
        if not callable(uri_to_content):
            # Explicit raise rather than assert, so the check survives `python -O`.
            raise TypeError("uri_to_content must be a callable if uris is None")
        # save_condition must be forwarded too; otherwise the returned function
        # would silently fall back to the default predicate.
        return partial(
            acquire_content,
            uri_to_content,
            store=store,
            save_condition=save_condition,
        )

    # Acquire each URI's content and store what passes the save condition.
    for key, uri in uris.items():
        content = uri_to_content(uri)
        if save_condition(content):
            store(key, content)
def ensure_store_func(store: Union[Dirpath, MutableMapping, Callable]) -> StoreFunc:
    """
    Return a ``store(key, content)`` function for the given ``store`` argument.

    - If ``store`` is callable, it is returned as is.
    - If ``store`` is a directory path (a ``str`` or an ``os.PathLike`` such as
      ``pathlib.Path``), ``~`` is expanded and the ``__setitem__`` of a
      ``dol.Files`` object rooted at that directory is returned.
    - If ``store`` is a ``MutableMapping``, its ``__setitem__`` is returned.

    :param store: A callable, a directory path, or a MutableMapping.
    :return: A function called as ``func(key, content)``.
    :raises ValueError: If ``store`` is a path that is not an existing
        directory, or if it is none of the supported kinds.

    Examples:

    >>> store = {}
    >>> func = ensure_store_func(store)
    >>> func('key', 'value')  # stores the value in the dictionary
    >>> assert store == {'key': 'value'}

    A path that is not an existing directory is rejected:

    >>> try:
    ...     ensure_store_func('/no/such/directory/for/ensure_store_func')
    ... except ValueError:
    ...     print("Directory does not exist, as expected.")
    Directory does not exist, as expected.

    Callables are passed through untouched:

    >>> ensure_store_func(lambda k, v: print(f"Storing {k}: {v}"))  # doctest: +ELLIPSIS
    <function <lambda> at ...>

    # Note: For Files store handling, you'll need a valid directory:
    # >>> ensure_store_func("/valid/directory/path")  # Requires dol.Files +SKIP
    """
    # Callables are checked first, as in the original precedence order.
    if callable(store):
        return store

    # Accept path objects (e.g. pathlib.Path) by converting them to their
    # filesystem representation before the str check.
    if isinstance(store, os.PathLike):
        store = os.fspath(store)

    if isinstance(store, str):
        dirpath = os.path.expanduser(store)
        if not os.path.isdir(dirpath):
            raise ValueError(f"The directory path {dirpath} does not exist.")
        # Imported lazily so dol is only required when a directory store is used.
        from dol import Files

        return Files(dirpath).__setitem__

    if isinstance(store, MutableMapping):
        return store.__setitem__

    raise ValueError("store must be a callable, or MutableMapping, or a dir path")
# A few useful uri_to_content functions, elegantly defined as (picklable) function compositions
from dol import Pipe
from pathlib import Path
import requests
from operator import methodcaller, attrgetter
# Each helper below is a dol Pipe over two callables (presumably applied in the
# order listed, the output of the first feeding the second -- confirm against
# dol's Pipe documentation).
# path -> Path(path) -> .read_bytes()
path_to_bytes = Pipe(Path, methodcaller('read_bytes'))
# path -> Path(path) -> .read_text()
path_to_string = Pipe(Path, methodcaller('read_text'))
# url -> requests.get(url) -> .content
# NOTE(review): no timeout or HTTP status check is applied here.
url_to_bytes = Pipe(requests.get, attrgetter('content'))
@thorwhalen
Copy link
Author

See comments about this code in this discussion

@thorwhalen
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment