royce002/wp_taxonomy_exporter.py

## wp_taxonomy_exporter.py
import xml.etree.ElementTree as ET
import os

def extract_categories_tags_from_wordpress_export(xml_file):
    """
    Parses a WordPress XML export file to extract all unique categories and tags.

    Every un-namespaced ``<category>`` element below the document root is
    inspected, at any depth. Elements whose ``domain`` attribute equals
    ``"category"`` feed the category list; elements whose ``domain`` equals
    ``"post_tag"`` feed the tag list. Elements with no text are skipped.

    Args:
        xml_file (str): The path to the WordPress XML export file.

    Returns:
        tuple: A tuple containing two lists: (categories, tags). Each list
            is de-duplicated and sorted. Both lists are empty when the file
            does not exist or is not well-formed XML; in that case an error
            message is printed. Any other exception propagates to the caller.
    """
    # Only the parse step can raise the two handled exceptions, so keep the
    # try body limited to it.
    try:
        root = ET.parse(xml_file).getroot()
    except ET.ParseError as e:
        print(f"Error parsing XML file: {e}")
        return [], []
    except FileNotFoundError:
        print(f"Error: The file '{xml_file}' was not found.")
        return [], []

    # Sets collapse the repeats produced by terms attached to several posts.
    categories = {
        node.text
        for node in root.findall('.//category[@domain="category"]')
        if node.text
    }
    tags = {
        node.text
        for node in root.findall('.//category[@domain="post_tag"]')
        if node.text
    }

    return sorted(categories), sorted(tags)

if __name__ == "__main__":
    # Paths are resolved relative to this script: the export is read from
    # the directory above it and the report is written alongside it.
    here = os.path.dirname(os.path.abspath(__file__))
    export_path = os.path.join(os.path.dirname(here), 'WordPress.2025-09-17.xml')

    print(f"Looking for XML file at: {export_path}")
    print(f"File exists: {os.path.exists(export_path)}")

    category_names, tag_names = extract_categories_tags_from_wordpress_export(export_path)

    print(f"Found {len(category_names)} categories and {len(tag_names)} tags")

    report_path = os.path.join(here, 'blog_categories_tags_output.txt')

    # Assemble the report first; a section is emitted only when it has
    # entries, so an empty extraction yields an empty file.
    report_lines = []
    if category_names:
        report_lines.append("### Categories ###\n")
        report_lines.extend(f"- {name}\n" for name in category_names)
    if tag_names:
        report_lines.append("\n### Tags ###\n")
        report_lines.extend(f"- {name}\n" for name in tag_names)

    with open(report_path, 'w', encoding='utf-8') as report:
        report.writelines(report_lines)

    print(f"Output written to: {report_path}")
	import xml.etree.ElementTree as ET
	import os

	def extract_categories_tags_from_wordpress_export(xml_file):
	    """
	    Parses a WordPress XML export file to extract all unique categories and tags.

	    Every un-namespaced ``<category>`` element below the document root is
	    inspected, at any depth. Elements whose ``domain`` attribute equals
	    ``"category"`` feed the category list; elements whose ``domain`` equals
	    ``"post_tag"`` feed the tag list. Elements with no text are skipped.

	    Args:
	        xml_file (str): The path to the WordPress XML export file.

	    Returns:
	        tuple: A tuple containing two lists: (categories, tags). Each list
	            is de-duplicated and sorted. Both lists are empty when the file
	            does not exist or is not well-formed XML; in that case an error
	            message is printed. Any other exception propagates to the caller.
	    """
	    # Only the parse step can raise the two handled exceptions, so keep the
	    # try body limited to it.
	    try:
	        root = ET.parse(xml_file).getroot()
	    except ET.ParseError as e:
	        print(f"Error parsing XML file: {e}")
	        return [], []
	    except FileNotFoundError:
	        print(f"Error: The file '{xml_file}' was not found.")
	        return [], []

	    # Sets collapse the repeats produced by terms attached to several posts.
	    categories = {
	        node.text
	        for node in root.findall('.//category[@domain="category"]')
	        if node.text
	    }
	    tags = {
	        node.text
	        for node in root.findall('.//category[@domain="post_tag"]')
	        if node.text
	    }

	    return sorted(categories), sorted(tags)

	if __name__ == "__main__":
	    # The XML file is in the parent directory of the directory holding
	    # this script.
	    script_dir = os.path.dirname(os.path.abspath(__file__))
	    parent_dir = os.path.dirname(script_dir)
	    xml_file_path = os.path.join(parent_dir, 'WordPress.2025-09-17.xml')

	    print(f"Looking for XML file at: {xml_file_path}")
	    print(f"File exists: {os.path.exists(xml_file_path)}")

	    extracted_categories, extracted_tags = extract_categories_tags_from_wordpress_export(xml_file_path)

	    print(f"Found {len(extracted_categories)} categories and {len(extracted_tags)} tags")

	    # The report is written next to this script.
	    output_file = os.path.join(script_dir, 'blog_categories_tags_output.txt')

	    # Write output to text file; a section is emitted only when it has
	    # entries, so an empty extraction yields an empty file.
	    with open(output_file, 'w', encoding='utf-8') as f:
	        if extracted_categories:
	            f.write("### Categories ###\n")
	            for category in extracted_categories:
	                f.write(f"- {category}\n")

	        if extracted_tags:
	            f.write("\n### Tags ###\n")
	            for tag in extracted_tags:
	                f.write(f"- {tag}\n")

	    print(f"Output written to: {output_file}")
No results found