Created
September 18, 2025 13:54
-
-
Save royce002/11935ca64c50d3b7c5f6a32137b3222c to your computer and use it in GitHub Desktop.
# WordPress Taxonomy Exporter

This Python script parses a standard WordPress XML export file (`.xml`) to extract a complete, unique, and alphabetized list of all categories and post tags.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import xml.etree.ElementTree as ET | |
| import os | |
def extract_categories_tags_from_wordpress_export(xml_file):
    """Extract the unique categories and tags from a WordPress XML export.

    Every ``<category>`` element below the document root is inspected: those
    with ``domain="category"`` contribute a category name and those with
    ``domain="post_tag"`` contribute a tag name. Elements with empty text are
    ignored and duplicates are collapsed.

    Args:
        xml_file (str): The path to the WordPress XML export file.

    Returns:
        tuple: ``(categories, tags)`` -- two lists of strings, each sorted in
        ascending order. If the file is missing or is not well-formed XML,
        an error message is printed and ``([], [])`` is returned.
    """
    # Only the parse step can raise the errors handled here, so keep the
    # try body limited to it.
    try:
        root = ET.parse(xml_file).getroot()
    except ET.ParseError as e:
        print(f"Error parsing XML file: {e}")
        return [], []
    except FileNotFoundError:
        print(f"Error: The file '{xml_file}' was not found.")
        return [], []

    # The <category> elements matched here are un-namespaced, so no
    # namespace map is needed for these lookups.
    categories = {
        element.text
        for element in root.findall('.//category[@domain="category"]')
        if element.text
    }
    tags = {
        element.text
        for element in root.findall('.//category[@domain="post_tag"]')
        if element.text
    }

    # sorted() accepts any iterable and always returns a new list.
    return sorted(categories), sorted(tags)
if __name__ == "__main__":
    # Paths are resolved relative to this script: the export is read from the
    # directory above it, and the report is written alongside it.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    xml_file_path = os.path.join(
        os.path.dirname(script_dir), 'WordPress.2025-09-17.xml'
    )
    output_file = os.path.join(script_dir, 'blog_categories_tags_output.txt')

    print(f"Looking for XML file at: {xml_file_path}")
    print(f"File exists: {os.path.exists(xml_file_path)}")

    extracted_categories, extracted_tags = (
        extract_categories_tags_from_wordpress_export(xml_file_path)
    )
    print(f"Found {len(extracted_categories)} categories and {len(extracted_tags)} tags")

    # Build the whole report in memory, one section per non-empty list.
    report_lines = []
    if extracted_categories:
        report_lines.append("### Categories ###\n")
        report_lines.extend(f"- {name}\n" for name in extracted_categories)
    if extracted_tags:
        report_lines.append("\n### Tags ###\n")
        report_lines.extend(f"- {name}\n" for name in extracted_tags)

    # The file is created (or truncated) even when there is nothing to write.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(report_lines)

    print(f"Output written to: {output_file}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment