Last active
July 7, 2025 16:19
-
-
Save mbutler/e4c9b2ef00f64bb1a66332a469a309aa to your computer and use it in GitHub Desktop.
convert html to markdown
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pymysql | |
| import re | |
| from bs4 import BeautifulSoup, XMLParsedAsHTMLWarning | |
| import html2text | |
| import warnings | |
| from pathlib import Path | |
# BeautifulSoup emits XMLParsedAsHTMLWarning when XML-looking input is fed to
# an HTML parser; silence that category for the whole run.
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

# ---- Configuration ----
DB_NAME = '4e_compendium_backup'
DB_USER = 'mbutler'
DB_PASS = ''
LOG_FILE = 'markdown_conversion_errors.log'

# ---- Database connection ----
# autocommit is on, so every statement executed through `cursor` is committed
# immediately without an explicit conn.commit().
_db_settings = {
    'host': 'localhost',
    'user': DB_USER,
    'password': DB_PASS,
    'database': DB_NAME,
    'charset': 'utf8mb4',
    'autocommit': True,
}
conn = pymysql.connect(**_db_settings)
cursor = conn.cursor()

# Begin every run with an empty error log (creates or truncates the file).
Path(LOG_FILE).write_text('')
def log_error(table, id_val, error):
    """Append a single failure record to the error log.

    Args:
        table: Name of the table being processed when the failure occurred.
        id_val: Value of the row's ID column.
        error: Description of the failure; interpolated into the log line.
    """
    with open(LOG_FILE, 'a', encoding='utf8') as log:
        entry = f"[{table} ID={id_val}] Error: {error}\n"
        log.write(entry)
def has_column(table, column):
    """Report whether `table` has a column matching `column`.

    The table name is interpolated into the statement (inside backticks),
    while `column` is passed as a bound parameter to the LIKE clause of
    SHOW COLUMNS.

    Returns:
        True if the query yields at least one row, False otherwise.
    """
    query = f"SHOW COLUMNS FROM `{table}` LIKE %s"
    cursor.execute(query, (column,))
    first_match = cursor.fetchone()
    return first_match is not None
def ensure_markdown_column(table):
    """Add a `Markdown` LONGTEXT (utf8mb4) column to `table` if it lacks one."""
    if has_column(table, 'Markdown'):
        # Column already present; nothing to alter.
        return
    cursor.execute(f"ALTER TABLE `{table}` ADD COLUMN `Markdown` LONGTEXT CHARACTER SET utf8mb4")
def get_target_tables():
    """Return the names of all tables in the database that have a `Txt` column."""
    cursor.execute("SHOW TABLES")
    # Materialise the table list first: has_column() reuses the same cursor,
    # so the SHOW TABLES result must be fully read before probing columns.
    listed = cursor.fetchall()
    targets = []
    for row in listed:
        name = row[0]
        if has_column(name, 'Txt'):
            targets.append(name)
    return targets
def sanitize_html(raw_html):
    """Reduce raw HTML to a bare structure ready for Markdown conversion.

    Steps, in order:
      1. Remove <script> and <style> elements.
      2. Rewrite anonym.to redirect links to their target URL.
      3. Strip every attribute from every tag.
      4. Remove the <title> element.
      5. Replace each <h1> with a <p> holding a literal Markdown heading:
         "# main", or "# main" followed by "## prefix" when the <h1>
         contains a <span> (the first <span>'s text becomes the prefix).

    Args:
        raw_html: HTML markup to clean.

    Returns:
        The serialised <body> element if the document has one, otherwise the
        whole serialised document.
    """
    soup = BeautifulSoup(raw_html, 'html.parser')

    # Remove script/style
    for tag in soup(['script', 'style']):
        tag.decompose()

    # Remove anonym.to redirect
    # NOTE(review): the attribute wipe just below also clears `href`, so this
    # rewrite currently has no effect on the returned HTML. Confirm whether
    # link targets were meant to survive into the Markdown.
    for a in soup.find_all('a', href=True):
        if 'anonym.to/?' in a['href']:
            a['href'] = re.sub(r'^https?://anonym\.to/\?+', '', a['href'])

    # Strip all tag attributes
    for tag in soup.find_all(True):
        tag.attrs = {}

    # Remove <title> (we won't use it for markdown anymore)
    if soup.title:
        soup.title.decompose()

    # Extract and replace all <h1> tags
    for h1 in soup.find_all('h1'):
        prefix = ''
        span = h1.find('span')
        if span is not None:
            # Join text fragments with a space: get_text(strip=True) alone
            # concatenates them with no separator, fusing words that were
            # only separated by markup such as <br> or nested tags.
            prefix = span.get_text(' ', strip=True)
            span.decompose()
        main = h1.get_text(' ', strip=True)
        # The heading is emitted as literal Markdown text inside a <p>.
        md_header = f"# {main}" if not prefix else f"# {main}\n\n## {prefix}"
        new_tag = soup.new_tag("p")
        new_tag.string = md_header
        h1.replace_with(new_tag)

    body = soup.body or soup
    return str(body)
def clean_markdown(md):
    """Tidy Markdown text after HTML conversion.

    Performs, in order:
      1. Removes empty-alt image references whose path ends in bullet.gif.
      2. Splits a combined "# Title ## Subtitle" heading into an H1 and an H2
         separated by a blank line. Only a genuine level-1 heading followed
         by a level-2 marker is split; deeper headings such as "### Foo" are
         left untouched.
      3. Strips trailing spaces and tabs from every line.
      4. Collapses runs of three or more newlines into one blank line.
      5. Strips leading and trailing whitespace from the whole text.

    Args:
        md: Markdown text to clean.

    Returns:
        The cleaned Markdown string.
    """
    md = re.sub(r'!\[\]\([^)]*bullet\.gif\)', '', md)
    # (?!#) after each marker keeps "###"/"####" headings from being mistaken
    # for "#" + "##". Leading whitespace is limited to spaces/tabs so the
    # match cannot swallow the blank line that precedes the heading.
    md = re.sub(
        r'^[ \t]*#(?!#)[ \t]*(.*?)\s*##(?!#)[ \t]*(.*?)$',
        r'# \1\n\n## \2',
        md,
        flags=re.MULTILINE,
    )
    # Strip trailing whitespace before collapsing blank lines; otherwise
    # whitespace-only lines would hide runs of newlines from the collapse.
    md = re.sub(r'[ \t]+$', '', md, flags=re.MULTILINE)
    md = re.sub(r'\n{3,}', '\n\n', md)
    return md.strip()
def safe_html_to_md(html):
    """Convert HTML to cleaned Markdown, falling back to a fenced HTML block.

    The input is sanitised, run through html2text, and then tidied. If any of
    those steps raises, the stripped original HTML is returned wrapped in a
    ```html code fence instead of propagating the error.
    """
    try:
        return clean_markdown(html2text.html2text(sanitize_html(html)))
    except Exception:
        # Best-effort fallback: keep the source markup rather than lose the row.
        return f"```html\n{html.strip()}\n```"
def convert_txt_column_to_markdown(table):
    """Fill `table`.Markdown from `table`.Txt for every row with non-NULL Txt.

    Ensures the Markdown column exists, reads all (ID, Txt) pairs, converts
    each Txt value, and writes the result back by ID. A failure on one row is
    written to the error log and processing continues with the next row.
    """
    ensure_markdown_column(table)

    select_sql = f"SELECT ID, Txt FROM `{table}` WHERE Txt IS NOT NULL"
    update_sql = f"UPDATE `{table}` SET Markdown=%s WHERE ID=%s"

    cursor.execute(select_sql)
    # Read every row up front: the same cursor is reused for the UPDATEs below.
    rows = cursor.fetchall()
    for row_id, source_html in rows:
        try:
            markdown = safe_html_to_md(source_html)
            cursor.execute(update_sql, (markdown, row_id))
        except Exception as exc:
            log_error(table, row_id, str(exc))
# Main execution: convert every table that has a Txt column, then release the
# database resources whether or not the run succeeded.
try:
    for table in get_target_tables():
        convert_txt_column_to_markdown(table)
    # Reference LOG_FILE so the message cannot drift from the configured path.
    print(f"✅ Markdown conversion complete. See '{LOG_FILE}' for any issues.")
finally:
    cursor.close()
    conn.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment