Created
January 7, 2026 20:54
-
-
Save cmutel/f10dc880afc9230a04898e8ea2e001df to your computer and use it in GitHub Desktop.
Fix UVEK 2025 database for compliance with EcoSpold 1 XSD
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Fix XML files by: | |
| 1. Adding namespace to root ecoSpold element | |
| 2. Fixing startDate to include day (use first day of month) | |
| 3. Fixing endDate to include day (use last day of month) | |
| 4. Removing sourceNumber attribute from source elements | |
| """ | |
| import sys | |
| import os | |
| import glob | |
| import xmltodict | |
| from datetime import datetime | |
| from calendar import monthrange | |
| import argparse | |
| from typing import Any, Dict, List, Optional, Union | |
def fix_date(date_str: str, use_last_day: bool = False) -> str:
    """
    Fix a date string to include year, month, and day.

    A complete date is re-emitted in zero-padded YYYY-MM-DD form. If the day
    is missing, the first day of the month is used (or the last day if
    use_last_day=True). Empty input and input that parses as neither form
    are returned unchanged.

    Args:
        date_str: Date string in format YYYY-MM or YYYY-MM-DD
        use_last_day: If True and day is missing, use last day of month

    Returns:
        Fixed date string in format YYYY-MM-DD, or the input unchanged when
        it is empty or not in a recognised format
    """
    if not date_str:
        return date_str

    # Try to parse as YYYY-MM-DD first
    try:
        dt = datetime.strptime(date_str, '%Y-%m-%d')
    except ValueError:
        pass
    else:
        # strptime also accepts unpadded fields (e.g. "2020-1-5"), so format
        # the parsed value instead of echoing the input; already padded
        # dates come back identical.
        return f"{dt.year:04d}-{dt.month:02d}-{dt.day:02d}"

    # Try to parse as YYYY-MM
    try:
        dt = datetime.strptime(date_str, '%Y-%m')
    except ValueError:
        # Invalid format, return as is
        return date_str

    # monthrange returns (weekday of first day, number of days in month)
    day = monthrange(dt.year, dt.month)[1] if use_last_day else 1
    return f"{dt.year:04d}-{dt.month:02d}-{day:02d}"
def remove_source_number(source: Union[Dict[str, Any], List[Dict[str, Any]]]) -> None:
    """
    Strip the sourceNumber attribute from the source element(s) found at
    /ecoSpold/dataset/metaInformation/modellingAndValidation/source.

    The element(s) are modified in place and nothing is returned. Only the
    element(s) passed in are touched; nested children are not visited.
    xmltodict represents XML attributes as keys with an '@' prefix, so the
    attribute appears as the '@sourceNumber' key.

    Args:
        source: A single source element (dict / OrderedDict) or a list of
            source elements as produced by xmltodict. Any other value, and
            any non-dict list entry, is left untouched.
    """
    # Treat a lone element as a one-item list so both shapes share one loop.
    elements = source if isinstance(source, list) else [source]
    for element in elements:
        if isinstance(element, dict):
            # pop with a default is a no-op when the attribute is absent
            element.pop('@sourceNumber', None)
def fix_xml_file(input_path: str, output_path: Optional[str] = None) -> None:
    """
    Fix an XML file according to the requirements.

    The file is parsed with xmltodict, then:
      1. the root ecoSpold element gets the EcoSpold01 xmlns attribute
         (an existing different value is overwritten with a notice),
      2. timePeriod/startDate is completed to the first day of the month,
      3. timePeriod/endDate is completed to the last day of the month,
      4. the sourceNumber attribute is removed from
         modellingAndValidation/source.
    Steps 2-4 are applied to every dataset element in the file. A step whose
    element path is missing prints a warning and is skipped; the remaining
    steps still run.

    Args:
        input_path: Path to input XML file
        output_path: Path to output XML file (default: overwrite input)
    """
    if output_path is None:
        output_path = input_path

    # Read XML file using xmltodict
    with open(input_path, 'rb') as f:
        doc = xmltodict.parse(f, process_namespaces=False)

    # Check if root element exists
    if 'ecoSpold' not in doc:
        print(f"Warning: Root element 'ecoSpold' not found in {input_path}")
        return

    # 1. Fix namespace: add xmlns attribute to root element if missing
    namespace = 'http://www.EcoInvent.org/EcoSpold01'
    root_element = doc['ecoSpold']
    if '@xmlns' not in root_element:
        root_element['@xmlns'] = namespace
    elif root_element['@xmlns'] != namespace:
        print(
            f"Overwriting incorrect XML namespace '{root_element['@xmlns']}' "
            f"with '{namespace}' in file '{input_path}'"
        )
        root_element['@xmlns'] = namespace

    # xmltodict yields a dict for a single <dataset> and a list when the
    # element repeats; normalise so every dataset is processed.
    dataset_field = root_element.get('dataset')
    datasets = dataset_field if isinstance(dataset_field, list) else [dataset_field]

    for dataset in datasets:
        # 2./3. Fix startDate (first day of month) and endDate (last day)
        for field, use_last_day in (('startDate', False), ('endDate', True)):
            try:
                time_period = dataset['metaInformation']['processInformation']['timePeriod']
                if time_period[field]:
                    time_period[field] = fix_date(time_period[field], use_last_day=use_last_day)
            except (KeyError, TypeError) as e:
                print(f"Warning: Could not fix {field} in {input_path}: {e}")

        # 4. Remove sourceNumber attribute
        try:
            source = dataset['metaInformation']['modellingAndValidation']['source']
            remove_source_number(source)
        except (KeyError, TypeError) as e:
            print(f"Warning: Could not remove sourceNumber in {input_path}: {e}")

    # Serialise before opening the output so a failure here cannot truncate
    # a file that is being fixed in place.
    output = xmltodict.unparse(
        doc,
        pretty=True,
        short_empty_elements=True
    )

    # Write to file
    with open(output_path, 'wb') as f:
        f.write(output.encode('utf-8'))
    print(f"Fixed: {input_path} -> {output_path}")
def main() -> None:
    """
    Command-line entry point.

    Parses the arguments, collects the XML files to process (either the
    positional file arguments or every *.xml file directly inside the
    -d/--directory folder) and runs fix_xml_file on each one.

    Exits with status 1 on invalid argument combinations, on a missing
    directory, or on the first file that fails to process. Input paths that
    are not existing files are reported on stderr and skipped.
    """
    parser = argparse.ArgumentParser(
        description='Fix XML files according to EcoSpold requirements. '
                    'This tool fixes EcoSpold XML files by: '
                    '(1) adding the required namespace to the root ecoSpold element, '
                    '(2) ensuring startDate and endDate have complete dates, '
                    '(3) removing sourceNumber attributes from source elements.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='Examples:\n'
               ' %(prog)s file.xml # Fix a single file\n'
               ' %(prog)s file1.xml file2.xml # Fix multiple files\n'
               ' %(prog)s -d /path/to/directory # Fix all XML files in a directory\n'
               ' %(prog)s file.xml -o output.xml # Fix file and save to output.xml'
    )
    parser.add_argument(
        'files',
        nargs='*',
        # Combining files with -d is rejected below, so say so here rather
        # than claiming the file arguments would be skipped.
        help='XML file(s) to fix. Can specify multiple files. '
             'Cannot be used together with -d/--directory.'
    )
    parser.add_argument(
        '-d', '--directory',
        dest='directory',
        metavar='DIR',
        help='Directory containing XML files to fix. All .xml files in the directory will be processed. '
             'Cannot be used together with file arguments.'
    )
    parser.add_argument(
        '-o', '--output',
        metavar='FILE',
        help='Output file path. Only valid when processing a single input file. '
             'If not specified, files are modified in place.'
    )
    args = parser.parse_args()

    # Validate arguments
    if args.directory and args.files:
        print("Error: Cannot specify both -d/--directory and file arguments", file=sys.stderr)
        sys.exit(1)
    if not args.directory and not args.files:
        parser.error("Either specify file(s) to fix or use -d/--directory")

    # Collect files to process
    files_to_process: List[str] = []
    if args.directory:
        # Process directory
        if args.output:
            print("Error: -o/--output cannot be used with -d/--directory", file=sys.stderr)
            sys.exit(1)
        if not os.path.isdir(args.directory):
            print(f"Error: Directory not found: {args.directory}", file=sys.stderr)
            sys.exit(1)
        # Find all XML files in directory; glob order is arbitrary, so sort
        # to make the processing order (and the log output) reproducible.
        xml_pattern = os.path.join(args.directory, '*.xml')
        files_to_process = sorted(glob.glob(xml_pattern))
        if not files_to_process:
            print(f"Warning: No XML files found in directory: {args.directory}", file=sys.stderr)
            return
        print(f"Found {len(files_to_process)} XML file(s) in {args.directory}")
    else:
        # Process individual files
        files_to_process = args.files
        if args.output and len(files_to_process) > 1:
            print("Error: -o/--output can only be used with a single input file", file=sys.stderr)
            sys.exit(1)

    # Process files
    for file_path in files_to_process:
        if not os.path.isfile(file_path):
            print(f"Warning: File not found: {file_path}", file=sys.stderr)
            continue
        try:
            # An empty -o value is treated the same as "not given"
            fix_xml_file(file_path, args.output or None)
        except Exception as e:
            print(f"Error processing {file_path}: {e}", file=sys.stderr)
            sys.exit(1)
# Run the command-line interface only when this file is executed as a script,
# so importing the module does not trigger argument parsing.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment