Skip to content

Instantly share code, notes, and snippets.

@borwickatuw
Last active December 5, 2025 14:32
Show Gist options
  • Select an option

  • Save borwickatuw/784fea025d17eb9770e8d5768db50965 to your computer and use it in GitHub Desktop.

Select an option

Save borwickatuw/784fea025d17eb9770e8d5768db50965 to your computer and use it in GitHub Desktop.
Initial Python wrapper for Hyku's SWORD endpoint. No warranty
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
Wrapper around the SWORD API, targeting Hyku's SWORD API specifically.
"""
import io
import logging
import mimetypes
import os
import re
from dataclasses import dataclass
from functools import wraps
import lxml.etree
import requests
# Module-level logger.  Named after this module (rather than being the
# root logger) so applications can filter or silence this wrapper's
# messages independently; records still propagate to root handlers.
logger = logging.getLogger(__name__)
class WorkException(Exception):
    """
    Signals that one or more values describing a work are unusable,
    e.g. a string that should look like a UUID but doesn't.

    `msg` becomes the exception message; `errors` is stored unchanged
    on the instance as `.errors` so callers can inspect each problem.
    """

    def __init__(self, msg, errors):
        # Keep the individual problems available next to the message.
        self.errors = errors
        Exception.__init__(self, msg)
UUIDISH_RE = re.compile(r"^[a-z0-9\-]+$")
@dataclass
class WorkToUpload:
"""
Track what we need in order to upload a work.
"""
on_behalf_of: str
work_type: str
title: str
creator: str
admin_set: str = "default"
user_collection: str | None = None
visibility: str | None = None
file_path: str | None = None
def validate(self):
"""
Raises WorkException if there are issues with any
attributes, e.g. if `admin_set` is not set to a valid-looking
UUID.
"""
# Philosophy is to do *all* the checking and return a list of
# errors.
errors = []
if self.admin_set and not UUIDISH_RE.match(self.admin_set):
errors.append(
f"admin_set is set to `{self.admin_set}`, which doesn't look like a UUID"
)
if self.user_collection and not UUIDISH_RE.match(self.user_collection):
errors.append(
f"user_collection is set to `{self.user_collection}`, which doesn't look like a UUID"
)
if self.file_path and not os.path.exists(self.file_path):
errors.append(
f"file_path is set to `{self.file_path}`, which does not exist"
)
if errors:
raise WorkException("; ".join(errors), errors=errors)
def create_metadata_xml(metadata_dict: dict):
    """
    Serialize a flat dictionary into the XML document expected by the
    SWORD upload work API endpoint.

    A `<metadata>` root element is created and, for every key/value
    pair, a child element named after the key whose text is the value.
    The result is pretty-printed, UTF-8 encoded and starts with an XML
    declaration (lxml returns it as bytes).
    """
    root = lxml.etree.Element("metadata")

    # One child element per dictionary entry, in dictionary order.
    for tag_name, text in metadata_dict.items():
        child = lxml.etree.SubElement(root, tag_name)
        child.text = text

    serialize_options = {
        "pretty_print": True,
        "xml_declaration": True,
        "encoding": "UTF-8",
    }
    return lxml.etree.tostring(root, **serialize_options)
# Mapping from the WorkToUpload attribute (key) to the XML
# metadata field name (value).
METADATA_FIELD_MAPPING = {
    "user_collection": "member_of_collection_ids",
    "visibility": "visibility",
}
def get_upload_metadata_for_work(work_to_upload):
    """
    Build the metadata dictionary for a `WorkToUpload` and return the
    result of passing it to `create_metadata_xml`.

    `title` and `creator` are always included.  Attributes listed in
    `METADATA_FIELD_MAPPING` are included, under their mapped field
    name, only when their value is truthy.
    """
    metadata = {
        "title": work_to_upload.title,
        "creator": work_to_upload.creator,
    }

    # Optional fields: skip anything that is falsy per Python
    # (None, empty string, ...).
    metadata.update(
        {
            field_name: value
            for attr_name, field_name in METADATA_FIELD_MAPPING.items()
            if (value := getattr(work_to_upload, attr_name))
        }
    )

    return create_metadata_xml(metadata)
class ConnectionException(Exception):
    """
    Raised whenever there's an issue talking to the SWORD API.

    `content` describes the failure (a message or the response body)
    and is also used as the exception message, so that tracebacks and
    `str(exc)` are not empty.  `response` is the response object that
    triggered the error; both are kept as attributes.
    """

    def __init__(self, content, response):
        # Hand the content to Exception so `str(exc)` / `exc.args`
        # carry it; without this the exception prints as blank.
        super().__init__(content)
        self.content = content
        self.response = response
class NotFoundException(ConnectionException):
    """
    ConnectionException variant used when the server answers with a
    404 status code.
    """
def file_upload_tuple(label, path):
    """
    Build one entry for the `files=` parameter of a requests call.

    The file name is the final component of `path`.  The MIME type is
    guessed from that name only (i.e. from the extension, not from the
    file contents, as of 2025-12-05) and falls back to
    "application/octet-stream" when nothing can be guessed.

    Returns `(label, (file_name, file_handle, mime_type))`.

    DANGER: the returned file handle is open (binary read mode) and
    needs to be closed manually by the caller!
    """
    file_name = os.path.basename(path)

    guessed_type = mimetypes.guess_type(file_name)[0]
    if not guessed_type:
        guessed_type = "application/octet-stream"

    # DANGER: this handle is handed back open; the caller must close it.
    handle = open(path, "rb")
    file_part = (file_name, handle, guessed_type)
    return (label, file_part)
def wrap_raw(func):
    """
    Decorator for functions that return a requests response object
    (adapted from oap_o365).

    The returned response is checked for `resp.ok`.  An OK response is
    passed through unchanged; anything else is logged and turned into
    an exception.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        """
        Call the original function and inspect the response it returns.

        Raises NotFoundException for a 404 status code and
        ConnectionException for any other non-OK response.
        """
        response = func(*args, **kwargs)

        # Happy path first: nothing to do for a successful response.
        if response.ok:
            return response

        status = response.status_code
        logger.error(
            "%s --> %s error: %s", response.url, status, response.content
        )

        if status == 404:
            raise NotFoundException(response.content, response=response)

        raise ConnectionException(f"Received a {status}", response=response)

    return wrapper
def wrap_xml(func):
    """
    Decorator: take the content of the response returned by `func`
    and parse it as XML.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        """
        Call the original function and return its response's
        `content` parsed with `lxml.etree.fromstring`.
        """
        raw_content = func(*args, **kwargs).content
        return lxml.etree.fromstring(raw_content)

    return wrapper
class HykuSWORDAPI:
    """
    Wrapper for the Hyku SWORD API.

    All requests are made through a single `requests.Session` and
    carry the API key in an `Api-key` header.
    """

    def __init__(self, api_base, api_key):
        """
        `api_base` is the URL prefix that endpoint paths are appended
        to; a trailing slash is added when it is missing.  `api_key`
        is sent with every request.
        """
        if not api_base.endswith("/"):
            api_base += "/"
        self.API_BASE = api_base
        self.API_KEY = api_key
        self.session = requests.Session()

    def headers(self):
        """
        Return a new dict with the headers every request needs.
        """
        return {"Api-key": self.API_KEY}

    @wrap_xml
    @wrap_raw
    def servicedocument(self):
        """
        GET `<api_base>service_document`.

        Through the decorators, a non-OK response raises
        NotFoundException / ConnectionException and an OK response is
        returned parsed as XML.
        """
        url = self.API_BASE + "service_document"
        return self.session.get(url, headers=self.headers())

    @wrap_xml
    @wrap_raw
    def upload_work(self, work_to_upload: WorkToUpload):
        """
        Upload a work: POST its metadata, plus the file at
        `file_path` when one is set, as a multipart request to
        `<api_base>collections/<admin_set>/works/`.

        Raises WorkException (from `work_to_upload.validate()`) before
        anything is sent if the work has invalid attributes.  Through
        the decorators, a non-OK response raises NotFoundException /
        ConnectionException and an OK response is returned parsed as
        XML.
        """
        work_to_upload.validate()

        url = self.API_BASE + f"collections/{work_to_upload.admin_set}/works/"

        headers = self.headers()
        headers.update(
            {
                "On-Behalf-Of": work_to_upload.on_behalf_of,
                "Hyrax-Work-Model": work_to_upload.work_type,
                "In-Progress": "false",
                "Packaging": "http://purl.org/net/sword/package/Binary",
            }
        )

        # List of files to upload.
        files = [
            (
                "metadata",
                (
                    "metadata.xml",
                    # In-memory content is accepted here in place of an
                    # open file object.
                    get_upload_metadata_for_work(work_to_upload),
                    "application/xml",
                ),
            )
        ]

        try:
            if work_to_upload.file_path:
                files.append(
                    file_upload_tuple(label="payload", path=work_to_upload.file_path)
                )

            return self.session.post(url, headers=headers, files=files)
        finally:
            # Close any file handle opened above, whether the request
            # succeeded or raised.  The metadata entry is not a file
            # object and is skipped by the isinstance check.
            for _, (_, file_obj, _) in files:
                if isinstance(file_obj, io.IOBase):
                    file_obj.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment