Created
January 26, 2026 16:41
-
-
Save emaballarin/b85d7c771823a116e506dfce42280786 to your computer and use it in GitHub Desktop.
Validate an XML document against an XSD schema, handling browser artifacts and remote imports
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """Validate an XML document against an XSD schema, handling browser artifacts and remote imports.""" | |
| import argparse | |
| import re | |
| import shutil | |
| import sys | |
| import tempfile | |
| from pathlib import Path, PurePosixPath | |
| from urllib.parse import urlparse | |
| from urllib.request import urlretrieve | |
| from lxml import etree | |
# Namespace URIs: XML Schema (used to build Clark-notation tag names) and
# XML Digital Signature (used to recognise signed documents).
XS_NS = "http://www.w3.org/2001/XMLSchema"
DSIG_NS = "http://www.w3.org/2000/09/xmldsig#"

# Empty <script> elements as injected by browsers: either self-closing
# (<script/>, <script xmlns=""/>) or an open/close pair containing only
# whitespace. Matching is case-insensitive and operates on raw bytes.
_SCRIPT_RE = re.compile(rb"<script\b[^>]*/\s*>|<script\b[^>]*>\s*</script\s*>", flags=re.IGNORECASE)

# Signature start tags that carry the dsig namespace URI inside the same tag
# (the ``[^>]*`` cannot cross the closing ``>``): one pattern for a prefixed
# name such as <ds:Signature ...>, one for a bare <Signature ...>.
_DSIG_PREFIXED_RE = re.compile(rb"<\w+:Signature\b[^>]*" + re.escape(DSIG_NS.encode()))
_DSIG_UNPREFIXED_RE = re.compile(rb"<Signature\b[^>]*" + re.escape(DSIG_NS.encode()))
class SchemaDownloadError(Exception):
    """Signal that a remotely referenced schema was refused or could not be fetched."""
def strip_script_artifacts(xml_bytes: bytes) -> bytes:
    """Return *xml_bytes* with every match of ``_SCRIPT_RE`` (empty ``<script>`` elements) deleted."""
    cleaned, _removed = _SCRIPT_RE.subn(b"", xml_bytes)
    return cleaned
def _has_signature(xml_bytes: bytes) -> bool:
    """Return True if *xml_bytes* appears to contain an XML digital signature element.

    Detection is deliberately conservative: callers use the result to decide
    whether the document may be modified, and a false "unsigned" verdict would
    let them invalidate a real signature.
    """
    # Fast path: the dsig namespace is declared on the Signature start tag itself.
    if _DSIG_PREFIXED_RE.search(xml_bytes) or _DSIG_UNPREFIXED_RE.search(xml_bytes):
        return True
    # Fallback: the namespace may be declared on an ancestor (e.g. the root
    # element), in which case the start tag is just <ds:Signature> and the
    # patterns above cannot match. Treat the document as signed when the dsig
    # namespace URI occurs anywhere and some element is named exactly
    # "Signature" (optionally prefixed); names like SignatureValue are excluded
    # by requiring whitespace, "/" or ">" right after the name.
    if DSIG_NS.encode() not in xml_bytes:
        return False
    return re.search(rb"<(?:[\w.-]+:)?Signature[\s/>]", xml_bytes) is not None
def _make_parser() -> etree.XMLParser:
    """Build an lxml parser configured with ``resolve_entities=False`` and ``no_network=True``."""
    hardening = {"resolve_entities": False, "no_network": True}
    return etree.XMLParser(**hardening)
def resolve_remote_schemas(xsd_tree: etree._ElementTree, tmp_dir: Path) -> None:
    """Download remote schemas referenced by ``xs:import`` and rewrite paths to local copies.

    Every ``xs:import`` whose ``schemaLocation`` starts with a URL scheme
    (``http``, ``https``, ``ftp`` or ``file``) is processed; other locations
    are treated as local paths and left untouched. Only ``https`` is actually
    fetched. The tree is modified in place: each processed ``schemaLocation``
    is replaced by the path of the local copy inside *tmp_dir*.

    Args:
        xsd_tree: Parsed XSD document to patch in place.
        tmp_dir: Directory receiving the downloaded schemas. A file already
            present under the chosen name is reused instead of re-downloaded.

    Raises:
        SchemaDownloadError: If a remote location uses a scheme other than
            ``https``, or if a download fails for any reason.
    """
    remote_prefixes = ("http://", "https://", "ftp://", "file://")
    # Local path -> URL that owns it, so that two different URLs sharing a
    # basename never end up pointing at the same local file.
    claimed: dict[Path, str] = {}

    for imp in xsd_tree.iter(f"{{{XS_NS}}}import"):
        loc = imp.get("schemaLocation", "")
        # URL schemes are case-insensitive; compare on a lowered copy so that
        # e.g. "HTTP://..." is not mistaken for a local path.
        if not loc.lower().startswith(remote_prefixes):
            # Local path — leave as-is.
            continue

        # SSRF: only allow HTTPS downloads (urlparse lowercases the scheme).
        parsed = urlparse(loc)
        if parsed.scheme != "https":
            raise SchemaDownloadError(
                f"Refusing to download schema over insecure scheme '{parsed.scheme}://': {loc}"
            )

        # Derive a safe local filename from the URL path. "." and ".." are
        # rejected because joining them to tmp_dir would not name a file in it.
        filename = PurePosixPath(parsed.path).name
        if filename in ("", ".", ".."):
            filename = "schema.xsd"

        # Pick a name not already claimed by a different URL in this run.
        local_name = tmp_dir / filename
        attempt = 0
        while claimed.setdefault(local_name, loc) != loc:
            attempt += 1
            local_name = tmp_dir / f"{attempt}_{filename}"

        if not local_name.exists():
            print(f"Downloading {loc} …", file=sys.stderr)
            try:
                urlretrieve(loc, local_name)  # noqa: S310
            except Exception as exc:
                raise SchemaDownloadError(f"Failed to download {loc}: {exc}") from exc

        imp.set("schemaLocation", str(local_name))
def _absolutize_local_schema_locations(xsd_tree: etree._ElementTree, base_dir: Path) -> None:
    """Rewrite relative local ``schemaLocation`` values so they resolve against *base_dir*."""
    tags = tuple(f"{{{XS_NS}}}{name}" for name in ("import", "include", "redefine"))
    for ref in xsd_tree.iter(*tags):
        loc = ref.get("schemaLocation", "")
        # Skip missing locations, URLs, and paths that are already absolute
        # (including the temp-dir paths written by resolve_remote_schemas).
        if not loc or "://" in loc or Path(loc).is_absolute():
            continue
        ref.set("schemaLocation", str(base_dir / loc))


def validate(xml_path: Path, xsd_path: Path) -> tuple[bool, list[str]]:
    """Validate *xml_path* against *xsd_path*. Returns ``(is_valid, errors)``.

    Browser-injected ``<script/>`` artifacts are always stripped from the XSD.
    They are stripped from the XML too, unless the XML is digitally signed, in
    which case it is validated unchanged. Neither input file is modified on
    disk. Remote ``xs:import`` targets are downloaded to a temporary directory.

    Args:
        xml_path: Path of the XML document to validate.
        xsd_path: Path of the XSD schema to validate against.

    Returns:
        A tuple of the validation verdict and the schema error log entries
        rendered as strings.

    Raises:
        OSError: If either file cannot be read or a temporary file cannot be written.
        etree.Error: If the XML or XSD cannot be parsed or the schema cannot be compiled.
        SchemaDownloadError: If a remote schema is refused or cannot be downloaded.
    """
    # NOTE: raw bytes are read without explicit encoding detection; the XML
    # declaration inside the document is expected to specify the encoding.
    raw_xml = xml_path.read_bytes()
    xsd_bytes = xsd_path.read_bytes()

    # Strip browser artifacts from the XSD.
    cleaned_xsd = strip_script_artifacts(xsd_bytes)
    if cleaned_xsd != xsd_bytes:
        print(
            "Notice: XSD contains browser <script/> artifacts; stripping them.",
            file=sys.stderr,
        )

    # Handle script artifacts in the XML conditionally: a signed document must
    # stay byte-identical, otherwise its signature would no longer verify.
    xml_bytes = raw_xml
    if _SCRIPT_RE.search(raw_xml):
        if _has_signature(raw_xml):
            print(
                "Notice: XML contains browser <script/> artifacts but is digitally signed; "
                "leaving XML unchanged to preserve the signature.",
                file=sys.stderr,
            )
        else:
            print(
                "Notice: XML contains browser <script/> artifacts; stripping them.",
                file=sys.stderr,
            )
            xml_bytes = strip_script_artifacts(raw_xml)

    parser = _make_parser()
    xsd_tree = etree.ElementTree(etree.fromstring(cleaned_xsd, parser=parser))

    # Downloads and the patched XSD live in separate directories so that the
    # patched file can never overwrite a downloaded schema of the same name.
    with tempfile.TemporaryDirectory() as downloads, tempfile.TemporaryDirectory() as workdir:
        resolve_remote_schemas(xsd_tree, Path(downloads))
        # The patched XSD is written outside its original directory, so
        # relative references to sibling schemas must be anchored to the
        # original location or they would be looked up in the temp dir.
        _absolutize_local_schema_locations(xsd_tree, xsd_path.resolve().parent)

        patched_xsd = Path(workdir) / xsd_path.name
        xsd_tree.write(str(patched_xsd), xml_declaration=True, encoding="UTF-8")
        schema = etree.XMLSchema(etree.parse(str(patched_xsd), parser=parser))

        # Validate inside the tempdir context so downloaded schemas remain accessible.
        xml_doc = etree.fromstring(xml_bytes, parser=parser)
        is_valid = schema.validate(xml_doc)
        errors = [str(e) for e in schema.error_log]

    return is_valid, errors
def main() -> None:
    """CLI entry point.

    Parses the command line, optionally strips ``<script/>`` artifacts from
    the XML file in place (``--strip``, after writing a ``.bak`` backup next to
    it), then validates the XML against the XSD.

    Exit status: 0 when the document is valid, 1 when it is invalid, 2 on any
    error (missing file, I/O failure, parse error, refused or failed schema
    download, or ``--strip`` requested on a signed document).
    """
    parser = argparse.ArgumentParser(description="Validate XML against XSD.")
    parser.add_argument("xml", type=Path, help="Path to the XML file")
    parser.add_argument("xsd", type=Path, help="Path to the XSD schema file")
    parser.add_argument(
        "--strip",
        action="store_true",
        help="Remove browser <script/> artifacts from the XML file in-place (unsigned only)",
    )
    args = parser.parse_args()

    for label, path in (("XML", args.xml), ("XSD", args.xsd)):
        if not path.is_file():
            print(f"Error: {label} file not found: {path}", file=sys.stderr)
            sys.exit(2)

    if args.strip:
        # File I/O here can fail (permissions, disk full); report it through
        # the same "Error: ... / exit 2" path as validation errors instead of
        # letting a traceback escape.
        try:
            raw = args.xml.read_bytes()
            if not _SCRIPT_RE.search(raw):
                print("No <script/> artifacts found in the XML file.", file=sys.stderr)
            elif _has_signature(raw):
                print(
                    "Error: XML is digitally signed; refusing to strip artifacts (would invalidate the signature).",
                    file=sys.stderr,
                )
                sys.exit(2)
            else:
                # Create a backup before modifying the file in-place.
                backup = args.xml.with_name(args.xml.name + ".bak")
                shutil.copy2(args.xml, backup)
                args.xml.write_bytes(strip_script_artifacts(raw))
                print(f"Stripped <script/> artifacts from {args.xml}", file=sys.stderr)
        except OSError as exc:
            print(f"Error: {exc}", file=sys.stderr)
            sys.exit(2)

    try:
        is_valid, errors = validate(args.xml, args.xsd)
    except (etree.Error, OSError, SchemaDownloadError) as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(2)

    if is_valid:
        print("VALID – the XML document conforms to the schema.")
        sys.exit(0)

    print("INVALID – validation errors:")
    for err in errors:
        print(f" • {err}")
    sys.exit(1)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment