#!/usr/bin/env python3
"""
Discord Export to DokuWiki Converter

Converts Discord channel exports (HTML) to DokuWiki pages.
Use https://github.com/Tyrrrz/DiscordChatExporter to obtain the input data.
"""

import html
import os
import re
import shutil
import sys
import urllib.parse
from pathlib import Path

from bs4 import BeautifulSoup


class DiscordToDokuWiki:
    def sanitize_media_filename(self, filename):
        """Sanitize a filename for DokuWiki media: lowercase, with only
        alphanumerics, dashes and underscores (no spaces or special chars)."""
        name, ext = os.path.splitext(filename)
        # Only allow a-z, 0-9, dash, underscore
        name = name.lower()
        name = re.sub(r'[^a-z0-9_-]', '_', name)
        name = re.sub(r'_+', '_', name)
        name = name.strip('_-')
        ext = ext.lower()
        return f"{name}{ext}"

    def __init__(self, source_dir, target_dir):
        self.source_dir = Path(source_dir)
        self.target_dir = Path(target_dir)

    def clean_filename(self, text):
        """Convert text to a valid DokuWiki page name."""
        # Decode HTML entities
        text = html.unescape(text)
        # Convert to lowercase
        text = text.lower()
        # Replace spaces and special chars with underscores
        text = re.sub(r'[^a-z0-9_-]', '_', text)
        # Collapse runs of underscores
        text = re.sub(r'_+', '_', text)
        # Remove leading/trailing underscores
        text = text.strip('_')
        return text
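
    # Illustrative example (the input text is made up):
    #   clean_filename("Main Characters") -> "main_characters"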

    def parse_discord_filename(self, filename):
        """Parse a Discord export filename into category, page name and ID."""
        # Pattern: "The End land - category - pagename [id].html"
        match = re.match(r'The End land - ([^-]+) - ([^[]+) \[(\d+)\]\.html', filename)
        if match:
            category = match.group(1).strip()
            pagename = match.group(2).strip()
            discord_id = match.group(3)
            return category, pagename, discord_id
        return None, None, None
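
    # Illustrative parse (the filename is made up, the pattern is the real one
    # used above):
    #   "The End land - NPC - Old Merchant [123456789].html"
    #     -> ("NPC", "Old Merchant", "123456789")
    # Note the first group is [^-]+, so a category name containing a hyphen
    # will not match and the file will be skipped.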

    def extract_message_content(self, message_div, html_file=None, media_subdir=None):
        """Extract text content and attachments from a Discord message,
        including markdown and images. Copies images into the media dir,
        emits the matching DokuWiki links, and prints debug output for
        image handling."""
        content_parts = []
        # Extract all markdown content (including multi-line and preserved)
        markdowns = message_div.find_all(['div', 'span'], class_=['chatlog__content', 'chatlog__markdown', 'chatlog__markdown-preserve'])
        for md in markdowns:
            text = md.get_text("\n", strip=True)
            if text and text not in content_parts:
                content_parts.append(text)
        # Find attachments (images, files)
        attachments = message_div.find_all('div', class_='chatlog__attachment')
        for attachment in attachments:
            # Find all images inside the attachment, even if deeply nested
            imgs = attachment.find_all('img', class_='chatlog__attachment-media')
            if imgs:
                print(f"[DEBUG] Found {len(imgs)} image(s) in attachment (file: {html_file})")
                for img in imgs:
                    alt_text = img.get('alt', 'Image')
                    src = img.get('src', '')
                    print(f"[DEBUG] Image src: {src}, alt: {alt_text}")
                    if src and html_file and media_subdir:
                        decoded_src = urllib.parse.unquote(src)
                        src_path = Path(decoded_src)
                        if not src_path.is_absolute():
                            html_dir = Path(html_file).parent
                            abs_src = html_dir / src_path
                        else:
                            abs_src = src_path
                        print(f"[DEBUG] Resolved abs_src: {abs_src}")
                        media_dir = self.target_dir / 'media' / media_subdir
                        media_dir.mkdir(parents=True, exist_ok=True)
                        # Force the filename to lowercase for DokuWiki media compatibility
                        sanitized_name = self.sanitize_media_filename(src_path.name)
                        dest_file = media_dir / sanitized_name
                        print(f"[DEBUG] Will copy to: {dest_file}")
                        try:
                            if abs_src.exists():
                                print(f"[DEBUG] Image file exists: {abs_src}")
                                shutil.copyfile(abs_src, dest_file)
                                content_parts.append(f"{{{{{media_subdir}:{sanitized_name}}}}}")
                                print(f"[DEBUG] Added DokuWiki image markup: {{{{{media_subdir}:{sanitized_name}}}}}")
                            else:
                                print(f"[WARN] Image file not found: {abs_src}")
                        except Exception as e:
                            print(f"[ERROR] Failed to copy image {abs_src}: {e}")
                    else:
                        print("[DEBUG] No src/html_file/media_subdir for image, using alt text.")
                        content_parts.append(f"[Image: {alt_text}]")
            else:
                print(f"[WARN] No image found in attachment for file {html_file}")
            # Check for file attachments that are not images
            file_link = attachment.find('a')
            if file_link and not attachment.find('img', class_='chatlog__attachment-media'):
                filename = file_link.get_text(strip=True)
                content_parts.append(f"[File: {filename}]")
        # Find embeds (YouTube, Spotify, etc.)
        embeds = message_div.find_all('div', class_='chatlog__embed')
        for embed in embeds:
            title_elem = embed.find('div', class_='chatlog__embed-title')
            if title_elem:
                title = title_elem.get_text(strip=True)
                link_elem = title_elem.find('a')
                if link_elem:
                    url = link_elem.get('href', '')
                    content_parts.append(f"[[{url}|{title}]]")
                else:
                    content_parts.append(f"**{title}**")
        # Remove duplicates while preserving order
        seen = set()
        unique_parts = []
        for part in content_parts:
            if part not in seen:
                unique_parts.append(part)
                seen.add(part)
        return '\n\n'.join(unique_parts)
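
    # The markup emitted above follows standard DokuWiki syntax:
    #   {{namespace:image.png}}         embeds a media file from that namespace
    #   [[https://example.org|Title]]   renders an external link with a label
    #   **bold**                        bold text (used for embeds without a link)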

    def convert_html_to_dokuwiki(self, html_file):
        """Convert a single Discord HTML file to DokuWiki format, copying
        images to the media dir."""
        with open(html_file, 'r', encoding='utf-8') as f:
            content = f.read()
        soup = BeautifulSoup(content, 'html.parser')
        # Extract title from preamble
        preamble = soup.find('div', class_='preamble')
        title = "Untitled"
        if preamble:
            entries = preamble.find_all('div', class_='preamble__entry')
            if len(entries) >= 2:
                title = entries[1].get_text(strip=True)
        # Extract messages
        messages = soup.find_all('div', class_='chatlog__message-container')
        dokuwiki_content = []
        dokuwiki_content.append(f"====== {title} ======")
        dokuwiki_content.append("")
        # Use the category parsed from the filename (e.g. npc, main_characters)
        # as the media subdirectory
        category, pagename, discord_id = self.parse_discord_filename(Path(html_file).name)
        media_subdir = self.clean_filename(category) if category else "media"
        for message in messages:
            # Extract timestamp
            timestamp_elem = message.find('span', class_='chatlog__timestamp')
            timestamp = ""
            if timestamp_elem:
                timestamp_link = timestamp_elem.find('a')
                if timestamp_link:
                    timestamp = timestamp_link.get_text(strip=True)
            # Extract author
            author_elem = message.find('span', class_='chatlog__author')
            author = "Unknown"
            if author_elem:
                author = author_elem.get_text(strip=True)
            # Extract message content; pass html_file and media_subdir for image copying
            message_content = self.extract_message_content(message, html_file=html_file, media_subdir=media_subdir)
            if message_content:
                if timestamp:
                    dokuwiki_content.append(f"===== {author} - {timestamp} =====")
                else:
                    dokuwiki_content.append(f"===== {author} =====")
                dokuwiki_content.append("")
                dokuwiki_content.append(message_content)
                dokuwiki_content.append("")
        return '\n'.join(dokuwiki_content)
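
    # Illustrative output for one converted page (author, timestamp, text and
    # image name are made up):
    #
    #   ====== Old Merchant ======
    #
    #   ===== Alice - 11-Jan-26 9:25 PM =====
    #
    #   Here is the portrait I promised.
    #
    #   {{npc:portrait.png}}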

    def create_directory_structure(self):
        """Create the DokuWiki directory structure based on Discord categories."""
        categories = set()
        for html_file in self.source_dir.glob("*.html"):
            category, pagename, discord_id = self.parse_discord_filename(html_file.name)
            if category:
                categories.add(category)
        # Create a directory for each category
        for category in categories:
            clean_category = self.clean_filename(category)
            category_dir = self.target_dir / clean_category
            category_dir.mkdir(parents=True, exist_ok=True)
            print(f"Created directory: {category_dir}")

    def convert_all_files(self):
        """Convert all Discord HTML files to DokuWiki pages."""
        self.create_directory_structure()
        converted_files = []
        for html_file in self.source_dir.glob("*.html"):
            category, pagename, discord_id = self.parse_discord_filename(html_file.name)
            if not category or not pagename:
                print(f"Skipping file with unknown format: {html_file.name}")
                continue
            # Clean names for DokuWiki
            clean_category = self.clean_filename(category)
            clean_pagename = self.clean_filename(pagename)
            # Convert to DokuWiki format
            print(f"Converting: {html_file.name}")
            dokuwiki_content = self.convert_html_to_dokuwiki(html_file)
            # Create the output file path
            output_dir = self.target_dir / clean_category
            output_file = output_dir / f"{clean_pagename}.txt"
            # Write the DokuWiki file
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(dokuwiki_content)
            converted_files.append({
                'original': html_file.name,
                'category': category,
                'pagename': pagename,
                'output': str(output_file.relative_to(self.target_dir))
            })
            print(f"  -> {output_file}")
        return converted_files

    def generate_index_page(self, converted_files):
        """Generate a main index page listing all converted pages."""
        index_content = []
        index_content.append("====== The End Land Wiki ======")
        index_content.append("")
        index_content.append("Welcome to The End Land wiki. This content was converted from Discord channels.")
        index_content.append("")
        # Group by category
        categories = {}
        for file_info in converted_files:
            category = file_info['category']
            if category not in categories:
                categories[category] = []
            categories[category].append(file_info)
        for category, files in sorted(categories.items()):
            clean_category = self.clean_filename(category)
            index_content.append(f"===== {category} =====")
            index_content.append("")
            for file_info in sorted(files, key=lambda x: x['pagename']):
                clean_pagename = self.clean_filename(file_info['pagename'])
                page_link = f"{clean_category}:{clean_pagename}"
                # DokuWiki list items need two leading spaces before the "*"
                index_content.append(f"  * [[{page_link}|{file_info['pagename']}]]")
            index_content.append("")
        # Write the index file
        index_file = self.target_dir / "start.txt"
        with open(index_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(index_content))
        print(f"Created index page: {index_file}")


def main():
    if len(sys.argv) != 3:
        print("Usage: python discord_to_dokuwiki.py <source_dir> <target_dir>")
        print("Example: python discord_to_dokuwiki.py . /home/grizzly/WWW/theendland.v.s.cz/data/pages")
        sys.exit(1)
    source_dir = sys.argv[1]
    target_dir = sys.argv[2]
    converter = DiscordToDokuWiki(source_dir, target_dir)
    print("Starting Discord to DokuWiki conversion...")
    print(f"Source directory: {source_dir}")
    print(f"Target directory: {target_dir}")
    print()
    converted_files = converter.convert_all_files()
    print(f"\nConverted {len(converted_files)} files successfully!")
    # Generate the index page
    converter.generate_index_page(converted_files)
    print("\nConversion complete!")
    print(f"DokuWiki pages have been created in: {target_dir}")


if __name__ == "__main__":
    main()