Skip to content

Instantly share code, notes, and snippets.

@cmutel
Created January 7, 2026 20:54
Show Gist options
  • Select an option

  • Save cmutel/f10dc880afc9230a04898e8ea2e001df to your computer and use it in GitHub Desktop.

Select an option

Save cmutel/f10dc880afc9230a04898e8ea2e001df to your computer and use it in GitHub Desktop.
Fix UVEK 2025 database for compliance with EcoSpold 1 XSD
#!/usr/bin/env python3
"""
Fix XML files by:
1. Adding namespace to root ecoSpold element
2. Fixing startDate to include day (use first day of month)
3. Fixing endDate to include day (use last day of month)
4. Removing sourceNumber attribute from source elements
"""
import sys
import os
import glob
import xmltodict
from datetime import datetime
from calendar import monthrange
import argparse
from typing import Any, Dict, List, Optional, Union
def fix_date(date_str: str, use_last_day: bool = False) -> str:
    """
    Complete a partial date so that it has year, month, and day.

    A value that already is a full date is re-serialised as zero-padded
    YYYY-MM-DD (so '2020-1-5' becomes '2020-01-05'). A value that only has
    year and month gets the first day of the month appended, or the last day
    if use_last_day=True.

    Args:
        date_str: Date string in format YYYY-MM or YYYY-MM-DD
        use_last_day: If True and day is missing, use last day of month

    Returns:
        Fixed date string in format YYYY-MM-DD. Empty values and strings that
        match neither format are returned unchanged.
    """
    if not date_str:
        return date_str

    # Full date: strptime also accepts non-padded fields ('2020-1-5'), so
    # format the parsed value instead of echoing the input back.
    try:
        parsed = datetime.strptime(date_str, '%Y-%m-%d')
    except ValueError:
        pass
    else:
        return f"{parsed.year:04d}-{parsed.month:02d}-{parsed.day:02d}"

    # Year and month only: the day has to be chosen
    try:
        parsed = datetime.strptime(date_str, '%Y-%m')
    except ValueError:
        # Invalid format, return as is
        return date_str

    # monthrange() returns (weekday of first day, number of days in month)
    day = monthrange(parsed.year, parsed.month)[1] if use_last_day else 1
    return f"{parsed.year:04d}-{parsed.month:02d}-{day:02d}"
def remove_source_number(source: Union[Dict[str, Any], List[Dict[str, Any]]]) -> None:
    """
    Strip the sourceNumber attribute from the source element(s) found at
    /ecoSpold/dataset/metaInformation/modellingAndValidation/source.

    The element(s) are modified in place. Only the element(s) passed in are
    touched; nothing nested inside them is visited. xmltodict stores XML
    attributes as keys with an '@' prefix, hence the '@sourceNumber' key.

    Args:
        source: A single source element (dict or OrderedDict) or a list of
            source elements, as produced by xmltodict
    """
    # Treat a lone element as a one-item list so both shapes share one loop
    elements = source if isinstance(source, list) else [source]
    for element in elements:
        # Anything that is not a mapping (e.g. plain text) has no attributes
        if isinstance(element, dict):
            element.pop('@sourceNumber', None)
def _fix_dataset(dataset: Any, input_path: str) -> None:
    """
    Apply the per-dataset fixes (startDate, endDate, sourceNumber) in place.

    Args:
        dataset: One parsed dataset element as produced by xmltodict
        input_path: Path of the file being fixed; only used in warnings
    """
    # 2./3. Fix startDate (first day of month) and endDate (last day of month)
    for tag, use_last_day in (('startDate', False), ('endDate', True)):
        try:
            time_period = dataset['metaInformation']['processInformation']['timePeriod']
            if time_period[tag]:
                time_period[tag] = fix_date(time_period[tag], use_last_day=use_last_day)
        except (KeyError, TypeError) as e:
            print(f"Warning: Could not fix {tag} in {input_path}: {e}")

    # 4. Remove sourceNumber attribute
    try:
        source = dataset['metaInformation']['modellingAndValidation']['source']
        remove_source_number(source)
    except (KeyError, TypeError) as e:
        print(f"Warning: Could not remove sourceNumber in {input_path}: {e}")


def fix_xml_file(input_path: str, output_path: Optional[str] = None) -> None:
    """
    Fix an XML file according to the requirements.

    The file is parsed with xmltodict, the namespace on the root ecoSpold
    element is set, and every dataset gets its startDate/endDate completed
    and its sourceNumber attribute removed. The result is written as UTF-8.
    Problems with individual elements are reported as warnings on stdout and
    do not stop the remaining fixes.

    Args:
        input_path: Path to input XML file
        output_path: Path to output XML file (default: overwrite input)
    """
    if output_path is None:
        output_path = input_path

    # Read XML file using xmltodict
    with open(input_path, 'rb') as f:
        doc = xmltodict.parse(f, process_namespaces=False)

    # Check if root element exists
    if 'ecoSpold' not in doc:
        print(f"Warning: Root element 'ecoSpold' not found in {input_path}")
        return

    # 1. Fix namespace: add xmlns attribute to root element if missing,
    #    or replace it if it has a different value
    namespace = 'http://www.EcoInvent.org/EcoSpold01'
    root_element = doc['ecoSpold']
    if '@xmlns' not in root_element:
        root_element['@xmlns'] = namespace
    elif root_element['@xmlns'] != namespace:
        current_namespace = root_element['@xmlns']
        print(
            f"Overwriting incorrect XML namespace '{current_namespace}' "
            f"with '{namespace}' in file '{input_path}'"
        )
        root_element['@xmlns'] = namespace

    # 2.-4. xmltodict yields a list when an element is repeated, so a file
    # with several datasets needs each one fixed individually. A missing
    # dataset is passed on as None and reported by the helper's warnings.
    datasets = root_element.get('dataset')
    if not isinstance(datasets, list):
        datasets = [datasets]
    for dataset in datasets:
        _fix_dataset(dataset, input_path)

    # Write back to XML using xmltodict; serialising before the output file
    # is opened means a failure here does not truncate an in-place target
    output = xmltodict.unparse(
        doc,
        pretty=True,
        short_empty_elements=True
    )

    # Write to file
    with open(output_path, 'wb') as f:
        f.write(output.encode('utf-8'))
    print(f"Fixed: {input_path} -> {output_path}")
def main() -> None:
    """
    Command-line entry point: parse arguments and fix the requested files.

    Accepts either a list of XML files or a directory (-d) whose *.xml files
    are all processed. With a single input file, -o selects a separate output
    file; otherwise files are modified in place.

    Exits with status 1 on conflicting arguments, a missing directory, or
    the first file that fails to process (remaining files are not processed).
    Missing input files are reported on stderr and skipped.
    """
    parser = argparse.ArgumentParser(
        description='Fix XML files according to EcoSpold requirements. '
        'This tool fixes EcoSpold XML files by: '
        '(1) adding the required namespace to the root ecoSpold element, '
        '(2) ensuring startDate and endDate have complete dates, '
        '(3) removing sourceNumber attributes from source elements.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='Examples:\n'
        ' %(prog)s file.xml # Fix a single file\n'
        ' %(prog)s file1.xml file2.xml # Fix multiple files\n'
        ' %(prog)s -d /path/to/directory # Fix all XML files in a directory\n'
        ' %(prog)s file.xml -o output.xml # Fix file and save to output.xml'
    )
    parser.add_argument(
        'files',
        nargs='*',
        help='XML file(s) to fix. Can specify multiple files. '
        'If -d/--directory is used, this argument is ignored.'
    )
    parser.add_argument(
        '-d', '--directory',
        dest='directory',
        metavar='DIR',
        help='Directory containing XML files to fix. All .xml files in the directory will be processed. '
        'Cannot be used together with file arguments.'
    )
    parser.add_argument(
        '-o', '--output',
        metavar='FILE',
        help='Output file path. Only valid when processing a single input file. '
        'If not specified, files are modified in place.'
    )
    args = parser.parse_args()

    # Validate arguments
    if args.directory and args.files:
        print("Error: Cannot specify both -d/--directory and file arguments", file=sys.stderr)
        sys.exit(1)
    if not args.directory and not args.files:
        parser.error("Either specify file(s) to fix or use -d/--directory")

    # Collect files to process
    files_to_process: List[str] = []
    if args.directory:
        # Process directory
        if args.output:
            print("Error: -o/--output cannot be used with -d/--directory", file=sys.stderr)
            sys.exit(1)
        if not os.path.isdir(args.directory):
            print(f"Error: Directory not found: {args.directory}", file=sys.stderr)
            sys.exit(1)
        # Find all XML files in directory. The directory part is escaped so
        # that glob metacharacters in its name ('[', ']', '*', '?') are taken
        # literally; sorting makes the processing order deterministic.
        xml_pattern = os.path.join(glob.escape(args.directory), '*.xml')
        files_to_process = sorted(glob.glob(xml_pattern))
        if not files_to_process:
            print(f"Warning: No XML files found in directory: {args.directory}", file=sys.stderr)
            return
        print(f"Found {len(files_to_process)} XML file(s) in {args.directory}")
    else:
        # Process individual files
        files_to_process = args.files
        if args.output and len(files_to_process) > 1:
            print("Error: -o/--output can only be used with a single input file", file=sys.stderr)
            sys.exit(1)

    # Process files
    for file_path in files_to_process:
        if not os.path.isfile(file_path):
            print(f"Warning: File not found: {file_path}", file=sys.stderr)
            continue
        try:
            fix_xml_file(file_path, args.output if args.output else None)
        except Exception as e:
            # Top-level boundary: report the failure and stop with an error status
            print(f"Error processing {file_path}: {e}", file=sys.stderr)
            sys.exit(1)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment