Install dependencies: npm install playwright cheerio node-html-markdown
Install the Firefox browser: npx playwright install firefox
Run the script: tsx html-to-md.ts https://mistral.ai/fr/news/mixtral-8x22b/
| import { load } from 'cheerio' | |
| import type { Browser, BrowserContext } from 'playwright' | |
| import { firefox } from 'playwright' | |
| import { NodeHtmlMarkdown } from 'node-html-markdown' | |
/**
 * Result of reading a single web page.
 */
export type Webpage = {
  /** URL that was requested. */
  url: string
  /** Sanitized HTML (or the inner HTML of the selected element). */
  html: string
  /** Markdown produced from `html`. */
  markdown: string
}
/**
 * Loads web pages in a headless Firefox instance (via Playwright) and converts
 * their HTML to Markdown.
 *
 * Lifecycle: call `init()` first, then `read()` as many times as needed, and
 * finally `dispose()` to release the browser.
 */
export class PageReader {
  private browser?: Browser
  private context?: BrowserContext

  /**
   * Launches headless Firefox and opens a browser context.
   *
   * Calling it while the reader is already initialized is a no-op, so an
   * accidental second call does not leak the first browser.
   */
  async init(): Promise<void> {
    if (this.browser && this.context) {
      return
    }
    const browser = await firefox.launch({
      headless: true,
    })
    try {
      this.context = await browser.newContext()
    } catch (error) {
      // Best-effort cleanup so a failed context creation does not leave an
      // orphaned browser; the original error is what the caller should see.
      await browser.close().catch(() => undefined)
      throw error
    }
    this.browser = browser
  }

  /**
   * Navigates to `pageUrl` and returns its HTML together with a Markdown
   * rendering of that HTML.
   *
   * @param pageUrl URL to open.
   * @param selector Optional CSS selector; when given, only the inner HTML of
   *   the first matching element is kept.
   * @returns An object with `url`, `html` and `markdown` fields.
   * @throws Error if `init()` has not been called (or `dispose()` already ran),
   *   or if `selector` matches no non-blank content.
   */
  async read(pageUrl: string, selector?: string) {
    const context = this.context
    if (!context) {
      throw new Error('PageReader is not initialized: call init() before read()')
    }
    const page = await context.newPage()
    try {
      await page.goto(pageUrl)
      // Serialize the live DOM as it stands once navigation has completed.
      const pageHtml = await page.evaluate(() => {
        return globalThis.document.documentElement.outerHTML
      })
      const contentHtml = this.sanitizeHtml(pageHtml, selector)
      return {
        url: pageUrl,
        html: contentHtml,
        markdown: NodeHtmlMarkdown.translate(contentHtml),
      }
    } finally {
      // Always release the tab, even when navigation or conversion fails.
      await page.close()
    }
  }

  /**
   * Closes the browser context and the browser, if they were opened.
   * Safe to call more than once; afterwards `init()` may be called again.
   */
  async dispose(): Promise<void> {
    const context = this.context
    const browser = this.browser
    // Clear the handles first so a later read() fails with a clear message
    // instead of using a closed context.
    this.context = undefined
    this.browser = undefined
    try {
      if (context) {
        await context.close()
      }
    } finally {
      // Close the browser even if closing the context failed.
      if (browser) {
        await browser.close()
      }
    }
  }

  /**
   * Reduces a full HTML document to the part worth converting.
   *
   * With a selector, returns the inner HTML of the first matching element and
   * throws when nothing (or only whitespace) is found. Without one, removes
   * script, style, path, footer, header and head elements and returns the rest
   * of the document.
   */
  private sanitizeHtml(html: string, selector?: string): string {
    const $ = load(html)
    if (selector) {
      const selectedHtml = $(selector).html()
      if (!selectedHtml || selectedHtml.trim() === '') {
        throw new Error(`No content found for selector: ${selector}`)
      }
      return selectedHtml
    }
    $('script, style, path, footer, header, head').remove()
    return $.html()
  }
}
/**
 * CLI entry point: reads the URL given as the first command-line argument and
 * prints the page's Markdown to stdout.
 *
 * Prints a usage message and sets exit code 1 when no URL is supplied.
 */
async function main(): Promise<void> {
  const pageUrl = process.argv[2]
  if (!pageUrl) {
    console.error('Usage: tsx html-to-md.ts <url>')
    process.exitCode = 1
    return
  }

  const pageReader = new PageReader()
  try {
    await pageReader.init()
    const page = await pageReader.read(pageUrl)
    console.log(page.markdown)
  } finally {
    // Release the browser even when init() or read() fails; otherwise the
    // browser process is left running.
    await pageReader.dispose()
  }
}

// Report failures explicitly instead of leaving an unhandled rejection.
main().catch((error: unknown) => {
  console.error(error)
  process.exitCode = 1
})