Last active
December 5, 2025 14:32
-
-
Save borwickatuw/784fea025d17eb9770e8d5768db50965 to your computer and use it in GitHub Desktop.
Initial Python wrapper for Hyku's SWORD endpoint. No warranty
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| """ | |
| Wrapper around the SWORD API, targeting Hyku's SWORD API specifically. | |
| """ | |
| import io | |
| import logging | |
| import mimetypes | |
| import os | |
| import re | |
| from dataclasses import dataclass | |
| from functools import wraps | |
| import lxml.etree | |
| import requests | |
# Use a module-scoped logger instead of the root logger, so applications that
# embed this wrapper can filter or silence its output by logger name. Records
# still propagate to handlers configured on the root logger.
logger = logging.getLogger(__name__)
class WorkException(Exception):
    """
    Signals a problem with one or more values on a work, e.g. a string
    that should be a UUID that isn't.

    `msg` becomes the exception message; the individual problems are
    kept on the `errors` attribute.
    """

    def __init__(self, msg, errors):
        self.errors = errors
        super().__init__(msg)
# Loose "looks like an identifier" check: lowercase letters, digits and
# hyphens only. `\Z` (rather than `$`) anchors at the true end of the string,
# so a value with a trailing newline is rejected instead of slipping through
# into a request URL.
UUIDISH_RE = re.compile(r"^[a-z0-9\-]+\Z")
| @dataclass | |
| class WorkToUpload: | |
| """ | |
| Track what we need in order to upload a work. | |
| """ | |
| on_behalf_of: str | |
| work_type: str | |
| title: str | |
| creator: str | |
| admin_set: str = "default" | |
| user_collection: str | None = None | |
| visibility: str | None = None | |
| file_path: str | None = None | |
| def validate(self): | |
| """ | |
| Raises WorkException if there are issues with any | |
| attributes, e.g. if `admin_set` is not set to a valid-looking | |
| UUID. | |
| """ | |
| # Philosophy is to do *all* the checking and return a list of | |
| # errors. | |
| errors = [] | |
| if self.admin_set and not UUIDISH_RE.match(self.admin_set): | |
| errors.append( | |
| f"admin_set is set to `{self.admin_set}`, which doesn't look like a UUID" | |
| ) | |
| if self.user_collection and not UUIDISH_RE.match(self.user_collection): | |
| errors.append( | |
| f"user_collection is set to `{self.user_collection}`, which doesn't look like a UUID" | |
| ) | |
| if self.file_path and not os.path.exists(self.file_path): | |
| errors.append( | |
| f"file_path is set to `{self.file_path}`, which does not exist" | |
| ) | |
| if errors: | |
| raise WorkException("; ".join(errors), errors=errors) | |
def create_metadata_xml(metadata_dict: dict):
    """
    Serialize a flat dictionary as the XML document expected by the
    SWORD upload work API endpoint.

    Each key becomes a child element of a root `<metadata>` element,
    with the corresponding value as its text. The result is the
    pretty-printed document, including an XML declaration, encoded as
    UTF-8 bytes.
    """
    root = lxml.etree.Element("metadata")

    # One child element per dictionary entry, in dictionary order.
    for tag, text in metadata_dict.items():
        child = lxml.etree.SubElement(root, tag)
        child.text = text

    return lxml.etree.tostring(
        root,
        encoding="UTF-8",
        xml_declaration=True,
        pretty_print=True,
    )
# Maps each optional WorkToUpload attribute name (key) to the XML
# metadata field name it is sent as (value).
METADATA_FIELD_MAPPING = dict(
    user_collection="member_of_collection_ids",
    visibility="visibility",
)
def get_upload_metadata_for_work(work_to_upload):
    """
    Build the metadata dictionary for a `WorkToUpload` and return it
    rendered by `create_metadata_xml`.

    `title` and `creator` are always included. The attributes listed in
    `METADATA_FIELD_MAPPING` are included, under their mapped field
    name, only when their value is truthy.
    """
    fields = {
        "title": work_to_upload.title,
        "creator": work_to_upload.creator,
    }

    # Optional attributes: skip anything falsy (None, empty string, ...).
    fields.update(
        {
            xml_tag: value
            for attr_name, xml_tag in METADATA_FIELD_MAPPING.items()
            if (value := getattr(work_to_upload, attr_name))
        }
    )

    return create_metadata_xml(fields)
class ConnectionException(Exception):
    """
    Raised whenever there's an issue talking to the Hyku SWORD API.

    Attributes:
        content: the message or response body describing the failure.
        response: the response object the failure was detected on.
    """

    def __init__(self, content, response):
        # Hand `content` to Exception so that str(exc), exc.args and
        # tracebacks carry it instead of being empty.
        super().__init__(content)
        self.content = content
        self.response = response
class NotFoundException(ConnectionException):
    """
    Specialization of `ConnectionException` raised for responses with
    status code 404.
    """
def file_upload_tuple(label, path):
    """
    Build one `(label, (file_name, file_handle, mime_type))` entry of
    the kind requests' `files=` parameter accepts.

    The file name is the basename of `path`. The mime type is guessed
    from the file name's extension only (not from file contents) and
    falls back to "application/octet-stream" when no guess is possible.

    DANGER: the returned file handle is open (binary read mode) and
    must be closed by the caller!
    """
    base_name = os.path.basename(path)

    guessed_type = mimetypes.guess_type(base_name)[0]
    if not guessed_type:
        guessed_type = "application/octet-stream"

    # DANGER: the caller is responsible for closing this handle.
    handle = open(path, "rb")
    return (label, (base_name, handle, guessed_type))
def wrap_raw(func):
    """
    Stole this from oap_o365.

    Decorator for a function that returns a requests response object.
    The response's `ok` flag is inspected: a response that is OK is
    returned unchanged; otherwise the failure is logged and an
    exception is raised.
    """

    @wraps(func)
    def checked_call(*args, **kwargs):
        """
        Invoke the wrapped function and vet the response it returns.

        Raises NotFoundException for a 404 status code and
        ConnectionException for any other non-OK response.
        """
        response = func(*args, **kwargs)

        # Happy path first: nothing to do for a successful response.
        if response.ok:
            return response

        logger.error(
            "%s --> %s error: %s",
            response.url,
            response.status_code,
            response.content,
        )

        if response.status_code == 404:
            raise NotFoundException(response.content, response=response)

        raise ConnectionException(
            "Received a {}".format(response.status_code), response=response
        )

    return checked_call
def wrap_xml(func):
    """
    Decorator that parses the wrapped function's response content as
    XML and returns the parsed result instead of the response.
    """

    @wraps(func)
    def parsed_call(*args, **kwargs):
        """
        Invoke the wrapped function and return its response's
        `content`, parsed with `lxml.etree.fromstring`.
        """
        return lxml.etree.fromstring(func(*args, **kwargs).content)

    return parsed_call
class HykuSWORDAPI:
    """
    Wrapper for the Hyku SWORD API.

    All requests go through one `requests.Session` and carry the API
    key in an `Api-key` header.
    """

    def __init__(self, api_base, api_key):
        """
        Args:
            api_base: base URL of the SWORD API; a trailing "/" is
                appended when missing so endpoint paths can simply be
                concatenated.
            api_key: value sent in the `Api-key` header.
        """
        if not api_base.endswith("/"):
            api_base += "/"

        self.API_BASE = api_base
        self.API_KEY = api_key
        self.session = requests.Session()

    def headers(self):
        """
        Return a fresh dict with the headers every request needs.
        """
        return {"Api-key": self.API_KEY}

    @wrap_xml
    @wrap_raw
    def servicedocument(self):
        """
        GET the `service_document` endpoint.

        The decorators check the response and return its content
        parsed as XML.
        """
        url = self.API_BASE + "service_document"
        return self.session.get(url, headers=self.headers())

    @wrap_xml
    @wrap_raw
    def upload_work(self, work_to_upload: WorkToUpload):
        """
        Validate `work_to_upload` and POST it, as a multipart request,
        to its admin set's works endpoint.

        The request always carries a `metadata` part (the generated
        metadata XML) and, when `file_path` is set, a `payload` part
        with that file. The decorators check the response and return
        its content parsed as XML.

        Raises:
            WorkException: when `work_to_upload.validate()` fails.
        """
        work_to_upload.validate()

        url = self.API_BASE + f"collections/{work_to_upload.admin_set}/works/"

        headers = {
            **self.headers(),
            "On-Behalf-Of": work_to_upload.on_behalf_of,
            "Hyrax-Work-Model": work_to_upload.work_type,
            "In-Progress": "false",
            "Packaging": "http://purl.org/net/sword/package/Binary",
        }

        # List of files to upload.
        files = [
            (
                "metadata",
                (
                    "metadata.xml",
                    # The serialized XML is passed directly; no file
                    # object is needed for this part.
                    get_upload_metadata_for_work(work_to_upload),
                    "application/xml",
                ),
            )
        ]

        payload_handle = None
        try:
            if work_to_upload.file_path:
                payload_part = file_upload_tuple(
                    label="payload", path=work_to_upload.file_path
                )
                # payload_part is (label, (file_name, handle, mime_type)).
                payload_handle = payload_part[1][1]
                files.append(payload_part)

            return self.session.post(url, headers=headers, files=files)
        finally:
            # Close the payload handle whether or not the request
            # succeeded, so a failed POST does not leak the open file.
            if payload_handle is not None:
                payload_handle.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment