#!/usr/bin/env python3
"""
Discord Export to DokuWiki Converter

Converts Discord channel exports (HTML) to DokuWiki pages.
Use https://github.com/Tyrrrz/DiscordChatExporter to obtain the input data.
"""

import html
import os
import re
import shutil
import sys
import urllib.parse
from pathlib import Path

from bs4 import BeautifulSoup


class DiscordToDokuWiki:
    def sanitize_media_filename(self, filename):
        """Sanitize a filename for DokuWiki media: lowercase, with only
        alphanumerics, dashes and underscores (no spaces or special chars)."""
        name, ext = os.path.splitext(filename)
        # Only allow a-z, 0-9, dash, underscore
        name = name.lower()
        name = re.sub(r'[^a-z0-9_-]', '_', name)
        name = re.sub(r'_+', '_', name)
        name = name.strip('_-')
        ext = ext.lower()
        return f"{name}{ext}"

    def __init__(self, source_dir, target_dir):
        self.source_dir = Path(source_dir)
        self.target_dir = Path(target_dir)

    def clean_filename(self, text):
        """Convert text to a valid DokuWiki page name."""
        # Decode HTML entities
        text = html.unescape(text)
        # Convert to lowercase
        text = text.lower()
        # Replace spaces and special chars with underscores
        text = re.sub(r'[^a-z0-9_-]', '_', text)
        # Collapse runs of underscores
        text = re.sub(r'_+', '_', text)
        # Remove leading/trailing underscores
        text = text.strip('_')
        return text
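
    # Illustrative example (the input text is made up):
    #   clean_filename("Main Characters") -> "main_characters"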

    def parse_discord_filename(self, filename):
        """Parse a Discord export filename into category, page name and ID."""
        # Pattern: "The End land - category - pagename [id].html"
        match = re.match(r'The End land - ([^-]+) - ([^[]+) \[(\d+)\]\.html', filename)
        if match:
            category = match.group(1).strip()
            pagename = match.group(2).strip()
            discord_id = match.group(3)
            return category, pagename, discord_id
        return None, None, None
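
    # Illustrative parse (the filename is made up, the pattern is the real one
    # used above):
    #   "The End land - NPC - Old Merchant [123456789].html"
    #     -> ("NPC", "Old Merchant", "123456789")
    # Note the first group is [^-]+, so a category name containing a hyphen
    # will not match and the file will be skipped.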

    def extract_message_content(self, message_div, html_file=None, media_subdir=None):
        """Extract text content and attachments from a Discord message,
        including markdown and images. Copies images into the media dir,
        emits the matching DokuWiki links, and prints debug output for
        image handling."""
        content_parts = []
        # Extract all markdown content (including multi-line and preserved)
        markdowns = message_div.find_all(['div', 'span'], class_=['chatlog__content', 'chatlog__markdown', 'chatlog__markdown-preserve'])
        for md in markdowns:
            text = md.get_text("\n", strip=True)
            if text and text not in content_parts:
                content_parts.append(text)
        # Find attachments (images, files)
        attachments = message_div.find_all('div', class_='chatlog__attachment')
        for attachment in attachments:
            # Find all images inside the attachment, even if deeply nested
            imgs = attachment.find_all('img', class_='chatlog__attachment-media')
            if imgs:
                print(f"[DEBUG] Found {len(imgs)} image(s) in attachment (file: {html_file})")
                for img in imgs:
                    alt_text = img.get('alt', 'Image')
                    src = img.get('src', '')
                    print(f"[DEBUG] Image src: {src}, alt: {alt_text}")
                    if src and html_file and media_subdir:
                        decoded_src = urllib.parse.unquote(src)
                        src_path = Path(decoded_src)
                        if not src_path.is_absolute():
                            html_dir = Path(html_file).parent
                            abs_src = html_dir / src_path
                        else:
                            abs_src = src_path
                        print(f"[DEBUG] Resolved abs_src: {abs_src}")
                        media_dir = self.target_dir / 'media' / media_subdir
                        media_dir.mkdir(parents=True, exist_ok=True)
                        # Force the filename to lowercase for DokuWiki media compatibility
                        sanitized_name = self.sanitize_media_filename(src_path.name)
                        dest_file = media_dir / sanitized_name
                        print(f"[DEBUG] Will copy to: {dest_file}")
                        try:
                            if abs_src.exists():
                                print(f"[DEBUG] Image file exists: {abs_src}")
                                shutil.copyfile(abs_src, dest_file)
                                content_parts.append(f"{{{{{media_subdir}:{sanitized_name}}}}}")
                                print(f"[DEBUG] Added DokuWiki image markup: {{{{{media_subdir}:{sanitized_name}}}}}")
                            else:
                                print(f"[WARN] Image file not found: {abs_src}")
                        except Exception as e:
                            print(f"[ERROR] Failed to copy image {abs_src}: {e}")
                    else:
                        print("[DEBUG] No src/html_file/media_subdir for image, using alt text.")
                        content_parts.append(f"[Image: {alt_text}]")
            else:
                print(f"[WARN] No image found in attachment for file {html_file}")
            # Check for file attachments that are not images
            file_link = attachment.find('a')
            if file_link and not attachment.find('img', class_='chatlog__attachment-media'):
                filename = file_link.get_text(strip=True)
                content_parts.append(f"[File: {filename}]")
        # Find embeds (YouTube, Spotify, etc.)
        embeds = message_div.find_all('div', class_='chatlog__embed')
        for embed in embeds:
            title_elem = embed.find('div', class_='chatlog__embed-title')
            if title_elem:
                title = title_elem.get_text(strip=True)
                link_elem = title_elem.find('a')
                if link_elem:
                    url = link_elem.get('href', '')
                    content_parts.append(f"[[{url}|{title}]]")
                else:
                    content_parts.append(f"**{title}**")
        # Remove duplicates while preserving order
        seen = set()
        unique_parts = []
        for part in content_parts:
            if part not in seen:
                unique_parts.append(part)
                seen.add(part)
        return '\n\n'.join(unique_parts)
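
    # The markup emitted above follows standard DokuWiki syntax:
    #   {{namespace:image.png}}         embeds a media file from that namespace
    #   [[https://example.org|Title]]   renders an external link with a label
    #   **bold**                        bold text (used for embeds without a link)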

    def convert_html_to_dokuwiki(self, html_file):
        """Convert a single Discord HTML file to DokuWiki format, copying
        images to the media dir."""
        with open(html_file, 'r', encoding='utf-8') as f:
            content = f.read()
        soup = BeautifulSoup(content, 'html.parser')
        # Extract title from preamble
        preamble = soup.find('div', class_='preamble')
        title = "Untitled"
        if preamble:
            entries = preamble.find_all('div', class_='preamble__entry')
            if len(entries) >= 2:
                title = entries[1].get_text(strip=True)
        # Extract messages
        messages = soup.find_all('div', class_='chatlog__message-container')
        dokuwiki_content = []
        dokuwiki_content.append(f"====== {title} ======")
        dokuwiki_content.append("")
        # Use the category parsed from the filename (e.g. npc, main_characters)
        # as the media subdirectory
        category, pagename, discord_id = self.parse_discord_filename(Path(html_file).name)
        media_subdir = self.clean_filename(category) if category else "media"
        for message in messages:
            # Extract timestamp
            timestamp_elem = message.find('span', class_='chatlog__timestamp')
            timestamp = ""
            if timestamp_elem:
                timestamp_link = timestamp_elem.find('a')
                if timestamp_link:
                    timestamp = timestamp_link.get_text(strip=True)
            # Extract author
            author_elem = message.find('span', class_='chatlog__author')
            author = "Unknown"
            if author_elem:
                author = author_elem.get_text(strip=True)
            # Extract message content; pass html_file and media_subdir for image copying
            message_content = self.extract_message_content(message, html_file=html_file, media_subdir=media_subdir)
            if message_content:
                if timestamp:
                    dokuwiki_content.append(f"===== {author} - {timestamp} =====")
                else:
                    dokuwiki_content.append(f"===== {author} =====")
                dokuwiki_content.append("")
                dokuwiki_content.append(message_content)
                dokuwiki_content.append("")
        return '\n'.join(dokuwiki_content)
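
    # Illustrative output for one converted page (author, timestamp, text and
    # image name are made up):
    #
    #   ====== Old Merchant ======
    #
    #   ===== Alice - 11-Jan-26 9:25 PM =====
    #
    #   Here is the portrait I promised.
    #
    #   {{npc:portrait.png}}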

    def create_directory_structure(self):
        """Create the DokuWiki directory structure based on Discord categories."""
        categories = set()
        for html_file in self.source_dir.glob("*.html"):
            category, pagename, discord_id = self.parse_discord_filename(html_file.name)
            if category:
                categories.add(category)
        # Create a directory for each category
        for category in categories:
            clean_category = self.clean_filename(category)
            category_dir = self.target_dir / clean_category
            category_dir.mkdir(parents=True, exist_ok=True)
            print(f"Created directory: {category_dir}")

    def convert_all_files(self):
        """Convert all Discord HTML files to DokuWiki pages."""
        self.create_directory_structure()
        converted_files = []
        for html_file in self.source_dir.glob("*.html"):
            category, pagename, discord_id = self.parse_discord_filename(html_file.name)
            if not category or not pagename:
                print(f"Skipping file with unknown format: {html_file.name}")
                continue
            # Clean names for DokuWiki
            clean_category = self.clean_filename(category)
            clean_pagename = self.clean_filename(pagename)
            # Convert to DokuWiki format
            print(f"Converting: {html_file.name}")
            dokuwiki_content = self.convert_html_to_dokuwiki(html_file)
            # Create the output file path
            output_dir = self.target_dir / clean_category
            output_file = output_dir / f"{clean_pagename}.txt"
            # Write the DokuWiki file
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(dokuwiki_content)
            converted_files.append({
                'original': html_file.name,
                'category': category,
                'pagename': pagename,
                'output': str(output_file.relative_to(self.target_dir))
            })
            print(f"  -> {output_file}")
        return converted_files

    def generate_index_page(self, converted_files):
        """Generate a main index page listing all converted pages."""
        index_content = []
        index_content.append("====== The End Land Wiki ======")
        index_content.append("")
        index_content.append("Welcome to The End Land wiki. This content was converted from Discord channels.")
        index_content.append("")
        # Group by category
        categories = {}
        for file_info in converted_files:
            category = file_info['category']
            if category not in categories:
                categories[category] = []
            categories[category].append(file_info)
        for category, files in sorted(categories.items()):
            clean_category = self.clean_filename(category)
            index_content.append(f"===== {category} =====")
            index_content.append("")
            for file_info in sorted(files, key=lambda x: x['pagename']):
                clean_pagename = self.clean_filename(file_info['pagename'])
                page_link = f"{clean_category}:{clean_pagename}"
                # DokuWiki list items need two leading spaces before the "*"
                index_content.append(f"  * [[{page_link}|{file_info['pagename']}]]")
            index_content.append("")
        # Write the index file
        index_file = self.target_dir / "start.txt"
        with open(index_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(index_content))
        print(f"Created index page: {index_file}")


def main():
    if len(sys.argv) != 3:
        print("Usage: python discord_to_dokuwiki.py <source_dir> <target_dir>")
        print("Example: python discord_to_dokuwiki.py . /home/grizzly/WWW/theendland.v.s.cz/data/pages")
        sys.exit(1)
    source_dir = sys.argv[1]
    target_dir = sys.argv[2]
    converter = DiscordToDokuWiki(source_dir, target_dir)
    print("Starting Discord to DokuWiki conversion...")
    print(f"Source directory: {source_dir}")
    print(f"Target directory: {target_dir}")
    print()
    converted_files = converter.convert_all_files()
    print(f"\nConverted {len(converted_files)} files successfully!")
    # Generate the index page
    converter.generate_index_page(converted_files)
    print("\nConversion complete!")
    print(f"DokuWiki pages have been created in: {target_dir}")


if __name__ == "__main__":
    main()