Skip to content

Instantly share code, notes, and snippets.

@Vitexus
Created January 11, 2026 21:25
Show Gist options
  • Select an option

  • Save Vitexus/d782aa64a5283cd12497952064591dae to your computer and use it in GitHub Desktop.

Select an option

Save Vitexus/d782aa64a5283cd12497952064591dae to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Discord Export to DokuWiki Converter
Converts Discord channel exports (HTML) to DokuWiki pages
Use https://github.com/Tyrrrz/DiscordChatExporter to obtain input data
"""
import os
import re
import html
from bs4 import BeautifulSoup
from pathlib import Path
import urllib.parse
class DiscordToDokuWiki:
    """Convert DiscordChatExporter HTML channel exports into DokuWiki pages.

    Pages are written as ``<target>/<category>/<pagename>.txt`` and image
    attachments are copied into ``<target>/media/<category>/`` with
    DokuWiki-safe filenames.
    """

    def __init__(self, source_dir, target_dir):
        """Remember the export directory and the DokuWiki data directory.

        :param source_dir: directory containing the exported ``*.html`` files
        :param target_dir: DokuWiki ``data/pages``-style output directory
        """
        self.source_dir = Path(source_dir)
        self.target_dir = Path(target_dir)

    def sanitize_media_filename(self, filename):
        """Sanitize a filename for DokuWiki media: lowercase, only
        alphanumerics, dash and underscore in the stem; extension lowercased."""
        name, ext = os.path.splitext(filename)
        # Only allow a-z, 0-9, dash, underscore in the stem.
        name = name.lower()
        name = re.sub(r'[^a-z0-9_-]', '_', name)
        name = re.sub(r'_+', '_', name)          # collapse runs of underscores
        name = name.strip('_-')
        ext = ext.lower()
        return f"{name}{ext}"

    def clean_filename(self, text):
        """Convert arbitrary text to a valid DokuWiki page name."""
        # Resolve HTML entities first (e.g. &amp; -> &) so they sanitize cleanly.
        text = html.unescape(text)
        text = text.lower()
        # Replace spaces and special chars with underscores.
        text = re.sub(r'[^a-z0-9_-]', '_', text)
        # Collapse multiple underscores, then trim them from the ends.
        text = re.sub(r'_+', '_', text)
        text = text.strip('_')
        return text

    def parse_discord_filename(self, filename):
        """Parse a DiscordChatExporter filename into (category, pagename, id).

        Expected pattern: ``The End land - category - pagename [id].html``.
        Returns ``(None, None, None)`` when the name does not match.
        """
        match = re.match(r'The End land - ([^-]+) - ([^[]+) \[(\d+)\]\.html', filename)
        if match:
            category = match.group(1).strip()
            pagename = match.group(2).strip()
            discord_id = match.group(3)
            return category, pagename, discord_id
        return None, None, None

    def extract_message_content(self, message_div, html_file=None, media_subdir=None):
        """Extract text, attachments and embeds from one Discord message.

        Image attachments are copied into the media directory and replaced
        with DokuWiki ``{{ns:file}}`` markup; emits debug output while doing so.

        :param message_div: BeautifulSoup tag for one message container
        :param html_file: path of the source HTML file (to resolve relative srcs)
        :param media_subdir: media namespace (category) to copy images into
        :return: deduplicated message parts joined by blank lines
        """
        content_parts = []
        # Collect all markdown content (including multi-line / preserved variants).
        markdowns = message_div.find_all(['div', 'span'], class_=['chatlog__content', 'chatlog__markdown', 'chatlog__markdown-preserve'])
        for md in markdowns:
            text = md.get_text("\n", strip=True)
            if text and text not in content_parts:
                content_parts.append(text)
        # Attachments: images get copied to the media dir, other files get a marker.
        attachments = message_div.find_all('div', class_='chatlog__attachment')
        for attachment in attachments:
            # Find all images inside the attachment, even if deeply nested.
            imgs = attachment.find_all('img', class_='chatlog__attachment-media')
            if imgs:
                print(f"[DEBUG] Found {len(imgs)} image(s) in attachment (file: {html_file})")
                for img in imgs:
                    alt_text = img.get('alt', 'Image')
                    src = img.get('src', '')
                    print(f"[DEBUG] Image src: {src}, alt: {alt_text}")
                    if src and html_file and media_subdir:
                        # src is URL-encoded and usually relative to the HTML file.
                        decoded_src = urllib.parse.unquote(src)
                        src_path = Path(decoded_src)
                        if not src_path.is_absolute():
                            html_dir = Path(html_file).parent
                            abs_src = html_dir / src_path
                        else:
                            abs_src = src_path
                        print(f"[DEBUG] Resolved abs_src: {abs_src}")
                        media_dir = self.target_dir / 'media' / media_subdir
                        media_dir.mkdir(parents=True, exist_ok=True)
                        # Force filename to lowercase for DokuWiki/media compatibility.
                        sanitized_name = self.sanitize_media_filename(src_path.name)
                        dest_file = media_dir / sanitized_name
                        print(f"[DEBUG] Will copy to: {dest_file}")
                        try:
                            if abs_src.exists():
                                print(f"[DEBUG] Image file exists: {abs_src}")
                                with open(abs_src, 'rb') as fsrc, open(dest_file, 'wb') as fdst:
                                    fdst.write(fsrc.read())
                                content_parts.append(f"{{{{{media_subdir}:{sanitized_name}}}}}")
                                print(f"[DEBUG] Added DokuWiki image markup: {{{{{media_subdir}:{sanitized_name}}}}}")
                            else:
                                print(f"[WARN] Image file not found: {abs_src}")
                        except Exception as e:
                            # Best-effort copy: log and keep converting the page.
                            print(f"[ERROR] Failed to copy image {abs_src}: {e}")
                    else:
                        print(f"[DEBUG] No src/html_file/media_subdir for image, using alt text.")
                        content_parts.append(f"[Image: {alt_text}]")
            else:
                print(f"[WARN] No image found in attachment for file {html_file}")
            # Non-image file attachments: record the linked filename.
            file_link = attachment.find('a')
            if file_link and not attachment.find('img', class_='chatlog__attachment-media'):
                filename = file_link.get_text(strip=True)
                # BUG FIX: previously appended a literal "(unknown)" placeholder
                # and never used the extracted filename.
                content_parts.append(f"[File: {filename}]")
        # Embeds (YouTube, Spotify, ...): keep the title, linked when possible.
        embeds = message_div.find_all('div', class_='chatlog__embed')
        for embed in embeds:
            title_elem = embed.find('div', class_='chatlog__embed-title')
            if title_elem:
                title = title_elem.get_text(strip=True)
                link_elem = title_elem.find('a')
                if link_elem:
                    url = link_elem.get('href', '')
                    content_parts.append(f"[[{url}|{title}]]")
                else:
                    content_parts.append(f"**{title}**")
        # Remove duplicates while preserving order.
        seen = set()
        unique_parts = []
        for part in content_parts:
            if part not in seen:
                unique_parts.append(part)
                seen.add(part)
        return '\n\n'.join(unique_parts)

    def convert_html_to_dokuwiki(self, html_file):
        """Convert one Discord HTML export to DokuWiki text, copying images.

        :param html_file: path of the export file
        :return: full DokuWiki page content as a single string
        """
        with open(html_file, 'r', encoding='utf-8') as f:
            content = f.read()
        soup = BeautifulSoup(content, 'html.parser')
        # The channel title is the second entry of the export preamble.
        preamble = soup.find('div', class_='preamble')
        title = "Untitled"
        if preamble:
            entries = preamble.find_all('div', class_='preamble__entry')
            if len(entries) >= 2:
                title = entries[1].get_text(strip=True)
        messages = soup.find_all('div', class_='chatlog__message-container')
        dokuwiki_content = []
        dokuwiki_content.append(f"====== {title} ======")
        dokuwiki_content.append("")
        # Media subdir comes from the category encoded in the export filename;
        # fall back to "media" when the filename does not match the pattern.
        category, pagename, discord_id = self.parse_discord_filename(Path(html_file).name)
        media_subdir = self.clean_filename(category) if category else "media"
        for message in messages:
            # Timestamp lives inside a link within the timestamp span.
            timestamp_elem = message.find('span', class_='chatlog__timestamp')
            timestamp = ""
            if timestamp_elem:
                timestamp_link = timestamp_elem.find('a')
                if timestamp_link:
                    timestamp = timestamp_link.get_text(strip=True)
            author_elem = message.find('span', class_='chatlog__author')
            author = "Unknown"
            if author_elem:
                author = author_elem.get_text(strip=True)
            # Extract message content; pass html_file/media_subdir for image copying.
            message_content = self.extract_message_content(message, html_file=html_file, media_subdir=media_subdir)
            if message_content:
                if timestamp:
                    dokuwiki_content.append(f"===== {author} - {timestamp} =====")
                else:
                    dokuwiki_content.append(f"===== {author} =====")
                dokuwiki_content.append("")
                dokuwiki_content.append(message_content)
                dokuwiki_content.append("")
        return '\n'.join(dokuwiki_content)

    def create_directory_structure(self):
        """Create one output directory per Discord category found in the source."""
        categories = set()
        for html_file in self.source_dir.glob("*.html"):
            category, pagename, discord_id = self.parse_discord_filename(html_file.name)
            if category:
                categories.add(category)
        for category in categories:
            clean_category = self.clean_filename(category)
            category_dir = self.target_dir / clean_category
            category_dir.mkdir(parents=True, exist_ok=True)
            print(f"Created directory: {category_dir}")

    def convert_all_files(self):
        """Convert every matching HTML export to a DokuWiki page.

        :return: list of dicts describing each converted file
                 (original, category, pagename, output)
        """
        self.create_directory_structure()
        converted_files = []
        for html_file in self.source_dir.glob("*.html"):
            category, pagename, discord_id = self.parse_discord_filename(html_file.name)
            if not category or not pagename:
                print(f"Skipping file with unknown format: {html_file.name}")
                continue
            # Clean names for DokuWiki.
            clean_category = self.clean_filename(category)
            clean_pagename = self.clean_filename(pagename)
            print(f"Converting: {html_file.name}")
            dokuwiki_content = self.convert_html_to_dokuwiki(html_file)
            output_dir = self.target_dir / clean_category
            output_file = output_dir / f"{clean_pagename}.txt"
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(dokuwiki_content)
            converted_files.append({
                'original': html_file.name,
                'category': category,
                'pagename': pagename,
                'output': str(output_file.relative_to(self.target_dir))
            })
            print(f" -> {output_file}")
        return converted_files

    def generate_index_page(self, converted_files):
        """Write ``start.txt`` linking every converted page, grouped by category.

        :param converted_files: list as returned by :meth:`convert_all_files`
        """
        index_content = []
        index_content.append("====== The End Land Wiki ======")
        index_content.append("")
        index_content.append("Welcome to The End Land wiki. This content was converted from Discord channels.")
        index_content.append("")
        # Group by category.
        categories = {}
        for file_info in converted_files:
            category = file_info['category']
            if category not in categories:
                categories[category] = []
            categories[category].append(file_info)
        for category, files in sorted(categories.items()):
            clean_category = self.clean_filename(category)
            index_content.append(f"===== {category} =====")
            index_content.append("")
            for file_info in sorted(files, key=lambda x: x['pagename']):
                clean_pagename = self.clean_filename(file_info['pagename'])
                page_link = f"{clean_category}:{clean_pagename}"
                index_content.append(f" * [[{page_link}|{file_info['pagename']}]]")
            index_content.append("")
        index_file = self.target_dir / "start.txt"
        with open(index_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(index_content))
        print(f"Created index page: {index_file}")
def main():
    """CLI entry point: convert a directory of Discord exports to DokuWiki pages."""
    import sys

    # Guard clause: exactly two positional arguments are required.
    if len(sys.argv) != 3:
        print("Usage: python discord_to_dokuwiki.py <source_dir> <target_dir>")
        print("Example: python discord_to_dokuwiki.py . /home/grizzly/WWW/theendland.v.s.cz/data/pages")
        sys.exit(1)

    source_dir, target_dir = sys.argv[1], sys.argv[2]
    converter = DiscordToDokuWiki(source_dir, target_dir)

    print("Starting Discord to DokuWiki conversion...")
    print(f"Source directory: {source_dir}")
    print(f"Target directory: {target_dir}")
    print()

    converted = converter.convert_all_files()
    print(f"\nConverted {len(converted)} files successfully!")

    # Build the start page linking everything that was just converted.
    converter.generate_index_page(converted)

    print("\nConversion complete!")
    print(f"DokuWiki pages have been created in: {target_dir}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment