# mbutler/html_to_markdown.py

## html_to_markdown.py
import pymysql
import re
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
import html2text
import warnings
from pathlib import Path

# Silence bs4's XMLParsedAsHTMLWarning for the whole run.
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

# CONFIG: database credentials and the location of the error log.
DB_NAME = '4e_compendium_backup'
DB_USER = 'mbutler'
DB_PASS = ''
LOG_FILE = 'markdown_conversion_errors.log'

# DB connection: a single module-level connection and cursor.
# autocommit=True is passed so statements are not batched into a transaction
# that would need an explicit commit.
_CONNECT_OPTIONS = {
    'host': 'localhost',
    'user': DB_USER,
    'password': DB_PASS,
    'database': DB_NAME,
    'charset': 'utf8mb4',
    'autocommit': True,
}
conn = pymysql.connect(**_CONNECT_OPTIONS)
cursor = conn.cursor()

# Start each run with an empty error log (the file is created if missing).
Path(LOG_FILE).write_text('')

def log_error(table, id_val, error):
    """Append one failure record to the error log file.

    Args:
        table: Name of the table the failing row belongs to.
        id_val: Value identifying the failing row.
        error: Description of the failure.
    """
    with open(LOG_FILE, mode='a', encoding='utf8') as log:
        record = f"[{table} ID={id_val}] Error: {error}"
        log.write(record + "\n")

def has_column(table, column):
    """Report whether `table` has a column matching `column`.

    The table name is interpolated into the statement between backticks, so
    it must be a trusted identifier. The column is passed as a bound
    parameter to a LIKE clause.

    Args:
        table: Name of the table to inspect.
        column: Column name (LIKE pattern) to look for.

    Returns:
        True if SHOW COLUMNS yields at least one row, otherwise False.
    """
    query = f"SHOW COLUMNS FROM `{table}` LIKE %s"
    cursor.execute(query, (column,))
    first_match = cursor.fetchone()
    return first_match is not None

def ensure_markdown_column(table):
    """Add a `Markdown` LONGTEXT (utf8mb4) column to `table` if it is missing.

    Args:
        table: Name of the table to alter; interpolated directly into the
            ALTER TABLE statement.
    """
    if has_column(table, 'Markdown'):
        # Nothing to do: the column is already there.
        return
    ddl = f"ALTER TABLE `{table}` ADD COLUMN `Markdown` LONGTEXT CHARACTER SET utf8mb4"
    cursor.execute(ddl)

def get_target_tables():
    """List the tables in the current database that have a `Txt` column.

    Returns:
        A list of table names, in the order SHOW TABLES returned them.
    """
    cursor.execute("SHOW TABLES")
    # Fetch every row up front: has_column() reuses the same cursor.
    targets = []
    for row in cursor.fetchall():
        name = row[0]
        if has_column(name, 'Txt'):
            targets.append(name)
    return targets

def sanitize_html(raw_html):
    """Parse raw HTML and return a simplified HTML string.

    Steps, in order:
      1. Remove <script> and <style> elements.
      2. Strip a leading anonym.to redirect prefix from link hrefs.
      3. Clear every attribute on every tag.
      4. Remove the <title> element.
      5. Replace each <h1> with a <p> whose text is a Markdown heading:
         "# main", or "# main", a blank line and "## prefix" when the <h1>
         contained a <span> (the span's text becomes the prefix).

    Args:
        raw_html: HTML markup to clean.

    Returns:
        The string form of the document's <body> if it has one, otherwise
        of the whole parsed document.
    """
    soup = BeautifulSoup(raw_html, 'html.parser')

    # Step 1: scripts and stylesheets carry no content.
    for unwanted in soup.find_all(['script', 'style']):
        unwanted.decompose()

    # Step 2: unwrap anonym.to redirects.
    for link in soup.find_all('a', href=True):
        target = link['href']
        if 'anonym.to/?' in target:
            link['href'] = re.sub(r'^https?://anonym\.to/\?+', '', target)

    # Step 3: drop all attributes.
    # NOTE(review): this also clears the href values rewritten in step 2, so
    # they do not appear in the returned markup -- confirm this is intended.
    for element in soup.find_all(True):
        element.attrs = {}

    # Step 4: <title> is not used for the Markdown output.
    if soup.title:
        soup.title.decompose()

    # Step 5: turn each <h1> into a paragraph of Markdown heading text.
    for heading in soup.find_all('h1'):
        span = heading.find('span')
        if span:
            prefix = span.get_text(strip=True)
            # Remove the span so its text is not repeated in the main title.
            span.decompose()
        else:
            prefix = ''

        main = heading.get_text(strip=True)
        if prefix:
            md_header = f"# {main}\n\n## {prefix}"
        else:
            md_header = f"# {main}"

        replacement = soup.new_tag("p")
        replacement.string = md_header
        heading.replace_with(replacement)

    return str(soup.body or soup)

def clean_markdown(md):
    """Tidy Markdown text produced from sanitized HTML.

    Steps, in order:
      1. Remove empty-alt images whose target ends in ``bullet.gif``.
      2. Split a level-1 heading of the form ``# Main ## Prefix`` into a
         ``# Main`` line, a blank line and a ``## Prefix`` line.
      3. Strip trailing spaces/tabs from every line.
      4. Collapse runs of three or more newlines into one blank line.
      5. Strip leading/trailing whitespace from the whole text.

    Args:
        md: Markdown text to clean.

    Returns:
        The cleaned Markdown string.
    """
    md = re.sub(r'!\[\]\([^)]*bullet\.gif\)', '', md)
    # (?!#) restricts the split to single-'#' headings; without it a line such
    # as "### Heading" matched as an empty "#" title followed by "## Heading".
    md = re.sub(
        r'^\s*#(?!#)\s*(.*?)\s*##\s*(.*?)$',
        r'# \1\n\n## \2',
        md,
        flags=re.MULTILINE,
    )
    # Strip trailing whitespace before collapsing newlines so that
    # whitespace-only lines also count as blank lines.
    md = re.sub(r'[ \t]+$', '', md, flags=re.MULTILINE)
    md = re.sub(r'\n{3,}', '\n\n', md)
    return md.strip()

def safe_html_to_md(html):
    """Convert HTML to cleaned Markdown, never raising on conversion errors.

    The HTML is sanitized, converted with html2text and then cleaned. If any
    of those steps raises, the stripped original HTML is returned wrapped in
    a fenced ``html`` code block instead.

    Args:
        html: HTML markup to convert.

    Returns:
        The Markdown text, or the fenced original HTML on failure.
    """
    try:
        cleaned_html = sanitize_html(html)
        return clean_markdown(html2text.html2text(cleaned_html))
    except Exception:
        # Best-effort fallback: keep the source markup rather than lose it.
        fenced_body = html.strip()
        return f"```html\n{fenced_body}\n```"

def convert_txt_column_to_markdown(table):
    """Fill `table`'s `Markdown` column from its `Txt` column.

    Ensures the `Markdown` column exists, then converts every non-NULL `Txt`
    value and stores the result on the same row (matched by `ID`). A failure
    on one row is written to the error log and does not stop the others.

    Args:
        table: Name of the table to process; interpolated directly into the
            SQL statements.
    """
    ensure_markdown_column(table)

    select_sql = f"SELECT ID, Txt FROM `{table}` WHERE Txt IS NOT NULL"
    update_sql = f"UPDATE `{table}` SET Markdown=%s WHERE ID=%s"

    cursor.execute(select_sql)
    # All rows are fetched before the loop because the UPDATEs reuse the
    # same cursor.
    rows = cursor.fetchall()
    for row_id, txt in rows:
        try:
            markdown = safe_html_to_md(txt)
            cursor.execute(update_sql, (markdown, row_id))
        except Exception as exc:
            log_error(table, row_id, str(exc))

# Main execution: convert every table that has a Txt column, then release the
# database resources whether or not the run succeeded.
try:
    for table in get_target_tables():
        convert_txt_column_to_markdown(table)

    # Report the configured log path rather than a hard-coded copy of it.
    print(f"✅ Markdown conversion complete. See '{LOG_FILE}' for any issues.")
finally:
    cursor.close()
    conn.close()
	import pymysql
	import re
	from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
	import html2text
	import warnings
	from pathlib import Path

	# Silence bs4's XMLParsedAsHTMLWarning for the whole run.
	warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

	# CONFIG: database credentials and the location of the error log.
	DB_NAME = '4e_compendium_backup'
	DB_USER = 'mbutler'
	DB_PASS = ''
	LOG_FILE = 'markdown_conversion_errors.log'

	# DB connection: a single module-level connection and cursor.
	# autocommit=True is passed so statements are not batched into a
	# transaction that would need an explicit commit.
	_CONNECT_OPTIONS = {
	    'host': 'localhost',
	    'user': DB_USER,
	    'password': DB_PASS,
	    'database': DB_NAME,
	    'charset': 'utf8mb4',
	    'autocommit': True,
	}
	conn = pymysql.connect(**_CONNECT_OPTIONS)
	cursor = conn.cursor()

	# Start each run with an empty error log (the file is created if missing).
	Path(LOG_FILE).write_text('')

	def log_error(table, id_val, error):
	    """Append one failure record to the error log file.

	    Args:
	        table: Name of the table the failing row belongs to.
	        id_val: Value identifying the failing row.
	        error: Description of the failure.
	    """
	    with open(LOG_FILE, mode='a', encoding='utf8') as log:
	        record = f"[{table} ID={id_val}] Error: {error}"
	        log.write(record + "\n")

	def has_column(table, column):
	    """Report whether `table` has a column matching `column`.

	    The table name is interpolated into the statement between backticks,
	    so it must be a trusted identifier. The column is passed as a bound
	    parameter to a LIKE clause.

	    Args:
	        table: Name of the table to inspect.
	        column: Column name (LIKE pattern) to look for.

	    Returns:
	        True if SHOW COLUMNS yields at least one row, otherwise False.
	    """
	    query = f"SHOW COLUMNS FROM `{table}` LIKE %s"
	    cursor.execute(query, (column,))
	    first_match = cursor.fetchone()
	    return first_match is not None

	def ensure_markdown_column(table):
	    """Add a `Markdown` LONGTEXT (utf8mb4) column to `table` if missing.

	    Args:
	        table: Name of the table to alter; interpolated directly into the
	            ALTER TABLE statement.
	    """
	    if has_column(table, 'Markdown'):
	        # Nothing to do: the column is already there.
	        return
	    ddl = f"ALTER TABLE `{table}` ADD COLUMN `Markdown` LONGTEXT CHARACTER SET utf8mb4"
	    cursor.execute(ddl)

	def get_target_tables():
	    """List the tables in the current database that have a `Txt` column.

	    Returns:
	        A list of table names, in the order SHOW TABLES returned them.
	    """
	    cursor.execute("SHOW TABLES")
	    # Fetch every row up front: has_column() reuses the same cursor.
	    targets = []
	    for row in cursor.fetchall():
	        name = row[0]
	        if has_column(name, 'Txt'):
	            targets.append(name)
	    return targets

	def sanitize_html(raw_html):
	    """Parse raw HTML and return a simplified HTML string.

	    Steps, in order:
	      1. Remove <script> and <style> elements.
	      2. Strip a leading anonym.to redirect prefix from link hrefs.
	      3. Clear every attribute on every tag.
	      4. Remove the <title> element.
	      5. Replace each <h1> with a <p> whose text is a Markdown heading:
	         "# main", or "# main", a blank line and "## prefix" when the
	         <h1> contained a <span> (the span's text becomes the prefix).

	    Args:
	        raw_html: HTML markup to clean.

	    Returns:
	        The string form of the document's <body> if it has one, otherwise
	        of the whole parsed document.
	    """
	    soup = BeautifulSoup(raw_html, 'html.parser')

	    # Step 1: scripts and stylesheets carry no content.
	    for unwanted in soup.find_all(['script', 'style']):
	        unwanted.decompose()

	    # Step 2: unwrap anonym.to redirects.
	    for link in soup.find_all('a', href=True):
	        target = link['href']
	        if 'anonym.to/?' in target:
	            link['href'] = re.sub(r'^https?://anonym\.to/\?+', '', target)

	    # Step 3: drop all attributes.
	    # NOTE(review): this also clears the href values rewritten in step 2,
	    # so they do not appear in the returned markup -- confirm intended.
	    for element in soup.find_all(True):
	        element.attrs = {}

	    # Step 4: <title> is not used for the Markdown output.
	    if soup.title:
	        soup.title.decompose()

	    # Step 5: turn each <h1> into a paragraph of Markdown heading text.
	    for heading in soup.find_all('h1'):
	        span = heading.find('span')
	        if span:
	            prefix = span.get_text(strip=True)
	            # Remove the span so its text is not repeated in the title.
	            span.decompose()
	        else:
	            prefix = ''

	        main = heading.get_text(strip=True)
	        if prefix:
	            md_header = f"# {main}\n\n## {prefix}"
	        else:
	            md_header = f"# {main}"

	        replacement = soup.new_tag("p")
	        replacement.string = md_header
	        heading.replace_with(replacement)

	    return str(soup.body or soup)

	def clean_markdown(md):
	    """Tidy Markdown text produced from sanitized HTML.

	    Steps, in order:
	      1. Remove empty-alt images whose target ends in ``bullet.gif``.
	      2. Split a level-1 heading of the form ``# Main ## Prefix`` into a
	         ``# Main`` line, a blank line and a ``## Prefix`` line.
	      3. Strip trailing spaces/tabs from every line.
	      4. Collapse runs of three or more newlines into one blank line.
	      5. Strip leading/trailing whitespace from the whole text.

	    Args:
	        md: Markdown text to clean.

	    Returns:
	        The cleaned Markdown string.
	    """
	    md = re.sub(r'!\[\]\([^)]*bullet\.gif\)', '', md)
	    # The quantifiers (\s*, .*?) are required for the pattern to match
	    # headings of arbitrary length. (?!#) restricts the split to
	    # single-'#' headings so that "### Heading" is left untouched.
	    md = re.sub(
	        r'^\s*#(?!#)\s*(.*?)\s*##\s*(.*?)$',
	        r'# \1\n\n## \2',
	        md,
	        flags=re.MULTILINE,
	    )
	    # Strip trailing whitespace before collapsing newlines so that
	    # whitespace-only lines also count as blank lines.
	    md = re.sub(r'[ \t]+$', '', md, flags=re.MULTILINE)
	    md = re.sub(r'\n{3,}', '\n\n', md)
	    return md.strip()

	def safe_html_to_md(html):
	    """Convert HTML to cleaned Markdown, never raising on conversion errors.

	    The HTML is sanitized, converted with html2text and then cleaned. If
	    any of those steps raises, the stripped original HTML is returned
	    wrapped in a fenced ``html`` code block instead.

	    Args:
	        html: HTML markup to convert.

	    Returns:
	        The Markdown text, or the fenced original HTML on failure.
	    """
	    try:
	        cleaned_html = sanitize_html(html)
	        return clean_markdown(html2text.html2text(cleaned_html))
	    except Exception:
	        # Best-effort fallback: keep the source markup rather than lose it.
	        fenced_body = html.strip()
	        return f"```html\n{fenced_body}\n```"

	def convert_txt_column_to_markdown(table):
	    """Fill `table`'s `Markdown` column from its `Txt` column.

	    Ensures the `Markdown` column exists, then converts every non-NULL
	    `Txt` value and stores the result on the same row (matched by `ID`).
	    A failure on one row is written to the error log and does not stop
	    the others.

	    Args:
	        table: Name of the table to process; interpolated directly into
	            the SQL statements.
	    """
	    ensure_markdown_column(table)

	    select_sql = f"SELECT ID, Txt FROM `{table}` WHERE Txt IS NOT NULL"
	    update_sql = f"UPDATE `{table}` SET Markdown=%s WHERE ID=%s"

	    cursor.execute(select_sql)
	    # All rows are fetched before the loop because the UPDATEs reuse the
	    # same cursor.
	    rows = cursor.fetchall()
	    for row_id, txt in rows:
	        try:
	            markdown = safe_html_to_md(txt)
	            cursor.execute(update_sql, (markdown, row_id))
	        except Exception as exc:
	            log_error(table, row_id, str(exc))

	# Main execution: convert every table that has a Txt column, then release
	# the database resources whether or not the run succeeded.
	try:
	    for table in get_target_tables():
	        convert_txt_column_to_markdown(table)

	    # Report the configured log path rather than a hard-coded copy of it.
	    print(f"✅ Markdown conversion complete. See '{LOG_FILE}' for any issues.")
	finally:
	    cursor.close()
	    conn.close()
# No results found