Skip to content

Instantly share code, notes, and snippets.

@mbutler
Last active July 7, 2025 16:19
Show Gist options
  • Select an option

  • Save mbutler/e4c9b2ef00f64bb1a66332a469a309aa to your computer and use it in GitHub Desktop.

Select an option

Save mbutler/e4c9b2ef00f64bb1a66332a469a309aa to your computer and use it in GitHub Desktop.
Convert HTML to Markdown
import pymysql
import re
from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning
import html2text
import warnings
from pathlib import Path
# Keep bs4's XMLParsedAsHTMLWarning out of the console output.
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
DB_NAME = '4e_compendium_backup'
DB_USER = 'mbutler'
DB_PASS = ''
LOG_FILE = 'markdown_conversion_errors.log'

# ---------------------------------------------------------------------------
# Database connection (shared by every function below through `cursor`)
# ---------------------------------------------------------------------------
_connect_kwargs = {
    'host': 'localhost',
    'user': DB_USER,
    'password': DB_PASS,
    'database': DB_NAME,
    'charset': 'utf8mb4',
    # Autocommit is enabled, so no explicit commit is issued anywhere in
    # this script.
    'autocommit': True,
}
conn = pymysql.connect(**_connect_kwargs)
cursor = conn.cursor()

# Truncate (or create) the error log so each run starts with an empty file.
Path(LOG_FILE).write_text('')
def log_error(table, id_val, error):
    """Append one failure record to the error log.

    The record is written to ``LOG_FILE`` as a single line of the form
    ``[<table> ID=<id_val>] Error: <error>``.

    Args:
        table: Name of the table being processed when the error occurred.
        id_val: Value identifying the affected row.
        error: The error (or its text) to record.
    """
    entry = f"[{table} ID={id_val}] Error: {error}\n"
    with open(LOG_FILE, mode='a', encoding='utf8') as log:
        log.write(entry)
def has_column(table, column):
    """Report whether ``table`` has a column matching ``column``.

    The check runs ``SHOW COLUMNS ... LIKE`` on the shared module-level
    cursor, so ``column`` is treated as a SQL ``LIKE`` pattern rather than
    a strict literal.

    Args:
        table: Table name; interpolated into the statement inside backticks.
        column: Column name (LIKE pattern), passed as a bound parameter.

    Returns:
        True if the statement yields at least one row, otherwise False.
    """
    query = f"SHOW COLUMNS FROM `{table}` LIKE %s"
    cursor.execute(query, (column,))
    first_match = cursor.fetchone()
    return first_match is not None
def ensure_markdown_column(table):
    """Add a ``Markdown`` LONGTEXT (utf8mb4) column to ``table`` if missing.

    Does nothing when ``has_column`` already reports a ``Markdown`` column.

    Args:
        table: Name of the table to alter.
    """
    if has_column(table, 'Markdown'):
        return
    cursor.execute(f"ALTER TABLE `{table}` ADD COLUMN `Markdown` LONGTEXT CHARACTER SET utf8mb4")
def get_target_tables():
    """Return the names of all tables that have a ``Txt`` column.

    Returns:
        A list of table names, in the order ``SHOW TABLES`` reported them.
    """
    cursor.execute("SHOW TABLES")
    # Materialise the listing first: has_column() reuses the same cursor.
    listing = cursor.fetchall()

    targets = []
    for record in listing:
        name = record[0]
        if has_column(name, 'Txt'):
            targets.append(name)
    return targets
def sanitize_html(raw_html, keep_attrs=()):
    """Reduce an HTML fragment to bare structural markup for conversion.

    Steps, in order:
      1. Drop ``<script>`` and ``<style>`` elements.
      2. Strip the ``anonym.to`` redirect prefix from link targets.
      3. Remove tag attributes, except those named in ``keep_attrs``.
      4. Drop the ``<title>`` element.
      5. Replace every ``<h1>`` with a ``<p>`` whose text is a Markdown
         heading: ``# <main>``, followed by ``## <prefix>`` when the
         ``<h1>`` contained a ``<span>`` (the span text is the prefix).

    Args:
        raw_html: HTML markup to clean.
        keep_attrs: Attribute names to preserve in step 3. A single name
            may be given as a plain string. With the default (empty),
            every attribute is removed, including the ``href`` values
            rewritten in step 2, so that rewrite only has a visible effect
            when ``'href'`` is kept, e.g. ``keep_attrs=('href', 'src')``.

    Returns:
        The serialised ``<body>`` element if the parsed document has one,
        otherwise the serialised document.
    """
    if isinstance(keep_attrs, str):
        # Guard against a bare string being treated as a set of characters.
        keep_attrs = (keep_attrs,)
    kept = frozenset(keep_attrs)

    soup = BeautifulSoup(raw_html, 'html.parser')

    # Remove script/style
    for tag in soup(['script', 'style']):
        tag.decompose()

    # Remove anonym.to redirect
    for a in soup.find_all('a', href=True):
        if 'anonym.to/?' in a['href']:
            a['href'] = re.sub(r'^https?://anonym\.to/\?+', '', a['href'])

    # Strip tag attributes (all of them unless the caller asked to keep some)
    for tag in soup.find_all(True):
        tag.attrs = {
            name: value for name, value in tag.attrs.items() if name in kept
        }

    # Remove <title> (it is not used for the Markdown output)
    if soup.title:
        soup.title.decompose()

    # Replace each <h1> with a paragraph carrying literal Markdown heading
    # markers; the heading text therefore reaches the converter as plain text.
    for h1 in soup.find_all('h1'):
        prefix = ''
        span = h1.find('span')
        if span:
            prefix = span.get_text(strip=True)
            span.decompose()
        main = h1.get_text(strip=True)
        md_header = f"# {main}" if not prefix else f"# {main}\n\n## {prefix}"
        new_tag = soup.new_tag("p")
        new_tag.string = md_header
        h1.replace_with(new_tag)

    body = soup.body or soup
    return str(body)
def clean_markdown(md):
    """Tidy Markdown text produced from the sanitised HTML.

    Performs, in order:
      1. Removes alt-less image references whose target ends in
         ``bullet.gif``.
      2. Splits a level-1 heading that is immediately followed by a ``##``
         marker (same line, or separated only by whitespace) into a ``#``
         heading and a ``##`` heading separated by a blank line.
      3. Strips trailing spaces/tabs from every line.
      4. Collapses runs of three or more newlines to a single blank line.

    Args:
        md: Markdown text.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    md = re.sub(r'!\[\]\([^)]*bullet\.gif\)', '', md)

    # `[ \t]*` (not `\s*`) at the start keeps the match on the heading's own
    # line, so the blank line before the heading is preserved. `(?!#)`
    # restricts the rule to level-1 headings; without it the leading `#` of
    # `##`/`###` headings was consumed and those headings were rewritten.
    md = re.sub(
        r'^[ \t]*#(?!#)\s*(.*?)\s*##\s*(.*?)$',
        r'# \1\n\n## \2',
        md,
        flags=re.MULTILINE,
    )

    # Strip trailing whitespace BEFORE collapsing blank lines; otherwise
    # whitespace-only lines hide the newline runs from the collapse step.
    md = re.sub(r'[ \t]+$', '', md, flags=re.MULTILINE)
    md = re.sub(r'\n{3,}', '\n\n', md)
    return md.strip()
def safe_html_to_md(html):
    """Convert HTML to Markdown, falling back to a fenced raw-HTML block.

    The HTML is passed through ``sanitize_html``, converted with
    ``html2text.html2text`` and tidied with ``clean_markdown``.

    Args:
        html: HTML markup to convert.

    Returns:
        The cleaned Markdown. If any conversion step raises, the original
        markup (stripped) wrapped in a ```` ```html ```` code fence.

    Warns:
        RuntimeWarning: when the conversion fails and the fallback is used.
    """
    try:
        sanitized = sanitize_html(html)
        md = html2text.html2text(sanitized)
        return clean_markdown(md)
    except Exception as exc:
        # The fallback is deliberate best-effort behaviour, but the cause
        # used to be discarded silently; surface it so failed conversions
        # can be noticed and investigated.
        warnings.warn(
            f"HTML to Markdown conversion failed ({exc!r}); "
            "storing raw HTML instead",
            RuntimeWarning,
            stacklevel=2,
        )
        return f"```html\n{html.strip()}\n```"
def convert_txt_column_to_markdown(table):
    """Fill ``table``'s ``Markdown`` column from its ``Txt`` column.

    Ensures the ``Markdown`` column exists, then converts the ``Txt`` value
    of every row where it is not NULL and stores the result in ``Markdown``
    for the row with the same ``ID``.

    Failures are written to the error log rather than raised:
      * a database error while preparing the table or reading its rows
        skips the whole table (logged with ``ID=N/A``);
      * an error on a single row is logged with that row's ID and the
        remaining rows are still processed.

    Args:
        table: Name of the table to process.
    """
    try:
        ensure_markdown_column(table)
        cursor.execute(f"SELECT ID, Txt FROM `{table}` WHERE Txt IS NOT NULL")
        rows = cursor.fetchall()
    except pymysql.MySQLError as e:
        # E.g. the table has no ID column or cannot be altered. Previously
        # this aborted the entire run; skip just this table instead.
        log_error(table, 'N/A', str(e))
        return

    for id_val, html in rows:
        try:
            md = safe_html_to_md(html)
            cursor.execute(f"UPDATE `{table}` SET Markdown=%s WHERE ID=%s", (md, id_val))
        except Exception as e:
            log_error(table, id_val, str(e))
# Main execution: convert every table that has a Txt column, then release
# the database resources whether or not the run succeeded.
try:
    for table in get_target_tables():
        convert_txt_column_to_markdown(table)
finally:
    cursor.close()
    conn.close()
# Report the configured log path rather than a hard-coded copy of it.
print(f"✅ Markdown conversion complete. See '{LOG_FILE}' for any issues.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment