Created
February 21, 2026 02:41
-
-
Save siygle/4e21f2e15d2e85c58954c27c34851f96 to your computer and use it in GitHub Desktop.
Convert Notion exported files to pure markdown files (for Obsidian)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env node | |
| import { readdir, readFile, writeFile, copyFile, mkdir, stat, access } from 'node:fs/promises'; | |
| import { join, basename, extname, dirname, resolve } from 'node:path'; | |
// --- Configuration ---
// BUG FIX: the original called resolve(process.argv[2]) first and checked
// `if (!SOURCE_DIR)` afterwards — but path.resolve(undefined) throws a
// TypeError, so the friendly error was unreachable. Validate the CLI
// argument BEFORE resolving it.
const sourceArg = process.argv[2];
if (!sourceArg) throw new Error("Export files not existed!");
const SOURCE_DIR = resolve(sourceArg);
const OUTPUT_DIR = resolve('./output');
// NOTE(review): ASSET_EXTENSIONS is not referenced anywhere in this script;
// kept for backward compatibility in case external code imports it.
const ASSET_EXTENSIONS = new Set(['.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.pdf', '.bmp', '.ico']);
// Notion export names look like "Title 0123456789abcdef0123456789abcdef.md".
const UUID_REGEX = /^(.+)\s+([a-f0-9]{32})$/;
// --- Stats ---
const stats = { files: 0, assets: 0, warnings: [] };
// Record a non-fatal problem; all warnings are printed in the final summary.
function warn(msg) {
  stats.warnings.push(msg);
}
// --- Phase 1: Discovery ---
/**
 * Recursively collect every markdown file under `sourceDir`.
 * @param {string} sourceDir - root of the Notion export
 * @returns {Promise<string[]>} absolute-ish paths (sourceDir joined with each entry)
 */
async function discoverFiles(sourceDir) {
  const allEntries = await readdir(sourceDir, { recursive: true });
  return allEntries
    .filter((relPath) => relPath.endsWith('.md'))
    .map((relPath) => join(sourceDir, relPath));
}
/**
 * Strip the 32-hex-char Notion UUID suffix from an exported filename.
 * @param {string} filename - e.g. "My Page 0123...cdef.md"
 * @returns {{title: string, uuid: string|null}} uuid is null when no suffix matched
 */
function cleanTitle(filename) {
  const stem = basename(filename, '.md');
  const parts = stem.match(UUID_REGEX);
  return parts
    ? { title: parts[1], uuid: parts[2] }
    : { title: stem, uuid: null };
}
/**
 * Directory of `filePath` expressed relative to the module-level SOURCE_DIR.
 * Returns '' for files sitting directly in SOURCE_DIR.
 * @param {string} filePath
 * @returns {string}
 */
function getRelativeDir(filePath) {
  const parent = dirname(filePath);
  // slice off "SOURCE_DIR" plus the path separator that follows it
  return parent === SOURCE_DIR ? '' : parent.slice(SOURCE_DIR.length + 1);
}
/**
 * Build the two lookup tables used by the converter:
 *  - uuidMap:   uuid -> { sourcePath, cleanTitle }   (for internal-link rewriting)
 *  - outputMap: sourcePath -> { relDir, title }      (final output name per file)
 * Title collisions are resolved per directory, deterministically, by sorting
 * on source path and suffixing duplicates with " (2)", " (3)", ...
 * @param {string[]} mdFiles
 */
function buildMappings(mdFiles) {
  const uuidMap = new Map();
  const outputMap = new Map();
  // "relDir/title" -> candidate list (collision detection scoped per directory)
  const collisionBuckets = new Map();
  for (const filePath of mdFiles) {
    const { title, uuid } = cleanTitle(filePath);
    const relDir = getRelativeDir(filePath);
    if (uuid) {
      uuidMap.set(uuid, { sourcePath: filePath, cleanTitle: title });
    }
    const bucketKey = `${relDir}/${title}`;
    const bucket = collisionBuckets.get(bucketKey) ?? [];
    bucket.push({ filePath, relDir, title });
    collisionBuckets.set(bucketKey, bucket);
  }
  // Resolve collisions deterministically: first (sorted) file keeps the
  // plain title, the rest get a numeric suffix.
  for (const bucket of collisionBuckets.values()) {
    bucket.sort((a, b) => a.filePath.localeCompare(b.filePath));
    bucket.forEach((candidate, idx) => {
      const finalTitle = idx === 0 ? candidate.title : `${candidate.title} (${idx + 1})`;
      outputMap.set(candidate.filePath, { relDir: candidate.relDir, title: finalTitle });
    });
  }
  return { uuidMap, outputMap };
}
// --- Phase 2: Metadata Parser ---
// Month-name lookup (lowercase) -> zero-based month index.
const MONTHS = {
  'january': 0, 'february': 1, 'march': 2, 'april': 3,
  'may': 4, 'june': 5, 'july': 6, 'august': 7,
  'september': 8, 'october': 9, 'november': 10, 'december': 11
};
/**
 * Normalize the two date formats Notion emits to ISO "YYYY-MM-DD".
 *  - "2024/02/16"
 *  - "October 19, 2010 12:36 PM" (time-of-day is discarded)
 * @param {string} dateStr
 * @returns {string|null} ISO date, or null when the format is unrecognized
 */
function parseNotionDate(dateStr) {
  const slashForm = dateStr.match(/^(\d{4})\/(\d{2})\/(\d{2})$/);
  if (slashForm) {
    return `${slashForm[1]}-${slashForm[2]}-${slashForm[3]}`;
  }
  const longForm = dateStr.match(/^(\w+)\s+(\d{1,2}),\s+(\d{4})/);
  if (!longForm) return null;
  const monthIndex = MONTHS[longForm[1].toLowerCase()];
  if (monthIndex === undefined) return null;
  const month = String(monthIndex + 1).padStart(2, '0');
  const day = longForm[2].padStart(2, '0');
  return `${longForm[3]}-${month}-${day}`;
}
// Metadata line patterns Notion writes right under the H1 title.
// `tags` and `created` are kept; `_skip` fields are recognized (so they get
// stripped from the body) but their values are discarded.
const METADATA_PATTERNS = [
  { key: 'tags', regex: /^tags:\s*(.+)$/i },
  { key: 'created', regex: /^(?:date|created|saved):\s*(.+)$/i },
  // Fields we recognize but skip
  { key: '_skip', regex: /^(?:type|status|slug|summary|category|updated|url|like|property|name):\s*(.+)$/i },
];
/**
 * Split a Notion-exported markdown document into its parts.
 * Expected layout: optional blank lines, an "# Title" line, blank lines,
 * an optional run of "key: value" metadata lines, then the body.
 *
 * @param {string} content - full file contents
 * @returns {{title: string, tags: string[], created: string|null, bodyStartIndex: number}}
 *   bodyStartIndex is the line index (after splitting on '\n') where the body begins.
 */
function parseMetadata(content) {
  const lines = content.split('\n');
  let title = '';
  let tags = [];
  let created = null;
  let bodyStartIndex = 0;
  // Extract title from first line
  let i = 0;
  for (; i < lines.length; i++) {
    const line = lines[i].trim();
    if (line.startsWith('# ')) {
      title = line.slice(2).trim();
      i++;
      break;
    }
    if (line !== '') break; // No title found
  }
  // Skip blank lines after title
  while (i < lines.length && lines[i].trim() === '') {
    i++;
  }
  // Parse metadata lines until blank line or unrecognized line.
  // `metaStart` remembers where this region began so we can rewind when it
  // turns out these lines were ordinary body text, not metadata.
  const metaStart = i;
  let foundMeta = false;
  for (; i < lines.length; i++) {
    const line = lines[i].trim();
    if (line === '') {
      // End of metadata block
      i++;
      break;
    }
    let matched = false;
    for (const { key, regex } of METADATA_PATTERNS) {
      const m = line.match(regex);
      if (m) {
        matched = true;
        if (key === 'tags') {
          tags = m[1].split(',').map(t => t.trim()).filter(Boolean);
        } else if (key === 'created') {
          const parsed = parseNotionDate(m[1].trim());
          if (parsed) created = parsed;
        }
        // _skip: just consume the line
        break;
      }
    }
    if (!matched) {
      // This line is body content, not metadata
      // But only if we haven't found any metadata yet - it might be that
      // this file simply has no metadata block
      if (!foundMeta) {
        i = metaStart;
      }
      break;
    }
    foundMeta = true;
  }
  // If no metadata was found, reset to after title + blank lines
  // (covers the case where the loop above exhausted all lines).
  if (!foundMeta) {
    i = metaStart;
  }
  bodyStartIndex = i;
  return { title, tags, created, bodyStartIndex };
}
/**
 * Render Obsidian YAML frontmatter. The '---' fences are always emitted,
 * even when both fields are absent.
 * @param {string[]} tags
 * @param {string|null} created - ISO date or null to omit
 * @returns {string}
 */
function generateFrontmatter(tags, created) {
  const lines = ['---'];
  if (tags.length > 0) {
    lines.push('tags:', ...tags.map((tag) => `  - ${tag}`));
  }
  if (created) {
    lines.push(`created: ${created}`);
  }
  lines.push('---');
  return lines.join('\n');
}
// --- Balanced parentheses link extraction ---
/**
 * Extract the target of a markdown link, honoring nested parentheses.
 * @param {string} text
 * @param {number} startIndex - must point at the '(' that follows the ']'
 * @returns {{path: string, endIndex: number}|null} null when startIndex is not
 *   '(' or the parentheses never balance; endIndex is one past the closing ')'.
 */
function extractBalancedPath(text, startIndex) {
  if (text[startIndex] !== '(') return null;
  let depth = 0;
  let pos = startIndex;
  while (pos < text.length) {
    const ch = text[pos];
    if (ch === '(') {
      depth += 1;
    } else if (ch === ')') {
      depth -= 1;
      if (depth === 0) {
        return { path: text.slice(startIndex + 1, pos), endIndex: pos + 1 };
      }
    }
    pos += 1;
  }
  return null; // ran off the end: unbalanced
}
/**
 * Find the ']' matching the '[' at `openIndex`, tracking nesting depth.
 * @param {string} text
 * @param {number} openIndex - index of the opening '['
 * @returns {number} index of the matching ']', or -1 when unbalanced
 */
function findClosingBracket(text, openIndex) {
  let depth = 0;
  let pos = openIndex;
  while (pos < text.length) {
    if (text[pos] === '[') {
      depth += 1;
    } else if (text[pos] === ']') {
      depth -= 1;
      if (depth === 0) return pos;
    }
    pos += 1;
  }
  return -1;
}
// --- Phase 3: Image Link Rewriter ---
/**
 * Rewrite `![alt](path)` image links and queue local assets for copying.
 *
 * BUG FIX: the original appended EMPTY template literals (`result += ``;`)
 * in all three emit branches, so every image link — external or local — was
 * silently deleted from the output, and both `alt` and `assetsRelPath` were
 * computed but never used. Now:
 *  - external (http/https) images pass through unchanged;
 *  - undecodable paths are kept as-is (plus a warning);
 *  - local assets are rewritten to point at the shared assets/ directory
 *    using the precomputed relative prefix, URL-encoded for markdown safety.
 *
 * @param {string} body - markdown body text
 * @param {string} sourceMdDir - directory of the source .md (for resolving asset paths)
 * @param {string} postTitle - output filename, used to namespace asset names
 * @param {Array<{source: string, destName: string}>} assetsToCopy - mutated: copy jobs appended
 * @param {string} assetsRelPath - relative prefix from the output file to assets/ (e.g. '../assets/')
 * @returns {string} rewritten body
 */
function rewriteImageLinks(body, sourceMdDir, postTitle, assetsToCopy, assetsRelPath) {
  // Scan for ![ then find balanced ] followed by balanced ()
  const imageStartRegex = /!\[/g;
  let result = '';
  let lastIndex = 0;
  let m;
  while ((m = imageStartRegex.exec(body)) !== null) {
    const bracketOpen = m.index + 1; // index of '['
    const bracketClose = findClosingBracket(body, bracketOpen);
    if (bracketClose === -1) continue;
    if (body[bracketClose + 1] !== '(') continue;
    const alt = body.slice(bracketOpen + 1, bracketClose);
    const extracted = extractBalancedPath(body, bracketClose + 1);
    if (!extracted) continue;
    const rawPath = extracted.path;
    result += body.slice(lastIndex, m.index);
    lastIndex = extracted.endIndex;
    // External URLs: keep the link untouched
    if (rawPath.startsWith('http://') || rawPath.startsWith('https://')) {
      result += `![${alt}](${rawPath})`;
      imageStartRegex.lastIndex = lastIndex;
      continue;
    }
    let decodedPath;
    try {
      decodedPath = decodeURIComponent(rawPath);
    } catch {
      warn(`Failed to decode image path: ${rawPath}`);
      result += `![${alt}](${rawPath})`; // keep the original link rather than dropping it
      imageStartRegex.lastIndex = lastIndex;
      continue;
    }
    const absolutePath = resolve(sourceMdDir, decodedPath);
    const assetFilename = basename(decodedPath);
    // Prefix with the post title to avoid cross-page filename collisions,
    // then strip characters that are invalid in filenames.
    const newAssetName = `${postTitle} - ${assetFilename}`;
    const safeAssetName = newAssetName.replace(/[<>:"/\\|?*]/g, '_');
    assetsToCopy.push({
      source: absolutePath,
      destName: safeAssetName,
    });
    result += `![${alt}](${assetsRelPath}${encodeURIComponent(safeAssetName)})`;
    imageStartRegex.lastIndex = lastIndex;
  }
  result += body.slice(lastIndex);
  return result;
}
// --- Phase 4: Internal Link Rewriter ---
/**
 * Rewrite `[text](target)` links (images were already handled in Phase 3):
 *  - external URLs (http/https/evernote) pass through unchanged;
 *  - `.md` links carrying a Notion UUID become Obsidian [[wikilinks]];
 *  - `.csv` links (Notion database views) collapse to their plain text;
 *  - everything else is left as-is.
 * @param {string} body
 * @param {Map<string, {sourcePath: string, cleanTitle: string}>} uuidMap
 * @returns {string}
 */
function rewriteInternalLinks(body, uuidMap) {
  // '[' not preceded by '!' — balanced bracket/paren matching from there.
  const openerPattern = /(?<!!)\[/g;
  let out = '';
  let copiedUpTo = 0;
  let hit;
  while ((hit = openerPattern.exec(body)) !== null) {
    const open = hit.index; // index of '['
    const close = findClosingBracket(body, open);
    if (close === -1) continue;
    if (body[close + 1] !== '(') continue;
    const text = body.slice(open + 1, close);
    const extracted = extractBalancedPath(body, close + 1);
    if (!extracted) continue;
    const rawPath = extracted.path;
    out += body.slice(copiedUpTo, open);
    copiedUpTo = extracted.endIndex;
    // Resume scanning just past this link in every branch below.
    openerPattern.lastIndex = copiedUpTo;
    // Skip external URLs
    const isExternal =
      rawPath.startsWith('http://') ||
      rawPath.startsWith('https://') ||
      rawPath.startsWith('evernote://');
    if (isExternal) {
      out += `[${text}](${rawPath})`;
      continue;
    }
    let decodedPath;
    try {
      decodedPath = decodeURIComponent(rawPath);
    } catch {
      out += `[${text}](${rawPath})`;
      continue;
    }
    // .md link with a Notion UUID -> wikilink to the cleaned title
    if (decodedPath.endsWith('.md')) {
      const uuidMatch = basename(decodedPath, '.md').match(UUID_REGEX);
      if (uuidMatch) {
        const uuid = uuidMatch[2];
        const entry = uuidMap.get(uuid);
        if (entry) {
          const linkTarget = entry.cleanTitle;
          out += text === linkTarget
            ? `[[${linkTarget}]]`
            : `[[${linkTarget}|${text}]]`;
          continue;
        }
        warn(`Could not resolve internal link UUID: ${uuid} for "${text}"`);
      }
    }
    // .csv link (Notion database view) - strip to plain text
    if (decodedPath.endsWith('.csv')) {
      out += text;
      continue;
    }
    // Leave other links as-is
    out += `[${text}](${rawPath})`;
  }
  out += body.slice(copiedUpTo);
  return out;
}
// --- Phase 5: Main Processing ---
/**
 * True when `filePath` exists and is accessible.
 * @param {string} filePath
 * @returns {Promise<boolean>}
 */
async function fileExists(filePath) {
  // access() rejects when the path is missing or unreadable.
  return access(filePath).then(
    () => true,
    () => false,
  );
}
/**
 * Convert a single Notion markdown file: strip Notion's metadata block, emit
 * YAML frontmatter, rewrite image + internal links, write the result into
 * OUTPUT_DIR (mirroring the source subdirectory), and copy referenced local
 * assets into OUTPUT_DIR/assets.
 *
 * @param {string} sourcePath - path to the source .md file
 * @param {{relDir: string, title: string}} param1 - output location and
 *   collision-resolved filename chosen by buildMappings
 * @param {Map<string, {sourcePath: string, cleanTitle: string}>} uuidMap
 */
async function processFile(sourcePath, { relDir, title: outputFilename }, uuidMap) {
  const content = await readFile(sourcePath, 'utf-8');
  // NOTE(review): the parsed `title` is currently unused — the output name
  // comes from the (UUID-stripped) filename instead.
  const { title, tags, created, bodyStartIndex } = parseMetadata(content);
  const lines = content.split('\n');
  const bodyLines = lines.slice(bodyStartIndex);
  let body = bodyLines.join('\n');
  // Compute relative path from this file's output dir to the assets/ dir
  // (one '../' per level of nesting).
  const depth = relDir ? relDir.split('/').length : 0;
  const assetsRelPath = (depth > 0 ? '../'.repeat(depth) : '') + 'assets/';
  // Rewrite image links; rewriteImageLinks appends copy jobs to assetsToCopy.
  const assetsToCopy = [];
  const sourceMdDir = dirname(sourcePath);
  body = rewriteImageLinks(body, sourceMdDir, outputFilename, assetsToCopy, assetsRelPath);
  // Rewrite internal links
  body = rewriteInternalLinks(body, uuidMap);
  // Generate output content (leading blank lines are trimmed off the body)
  const frontmatter = generateFrontmatter(tags, created);
  const outputContent = `${frontmatter}\n\n${body.replace(/^\n+/, '')}`;
  // Write markdown file into the correct subdirectory
  const outputDir = relDir ? join(OUTPUT_DIR, relDir) : OUTPUT_DIR;
  await mkdir(outputDir, { recursive: true });
  const outputPath = join(outputDir, `${outputFilename}.md`);
  await writeFile(outputPath, outputContent, 'utf-8');
  stats.files++;
  // Copy assets. Missing or uncopyable assets are reported as warnings so
  // one bad reference never aborts the whole conversion.
  for (const asset of assetsToCopy) {
    const destPath = join(OUTPUT_DIR, 'assets', asset.destName);
    if (await fileExists(asset.source)) {
      try {
        await copyFile(asset.source, destPath);
        stats.assets++;
      } catch (err) {
        warn(`Failed to copy asset ${asset.source}: ${err.message}`);
      }
    } else {
      warn(`Asset not found: ${asset.source} (referenced from ${basename(sourcePath)})`);
    }
  }
}
/**
 * Script driver: validate the source directory, build the lookup maps, then
 * convert every discovered file, printing progress and a final summary
 * (including all accumulated warnings).
 */
async function main() {
  console.log(`Source: ${SOURCE_DIR}`);
  console.log(`Output: ${OUTPUT_DIR}`);
  // Verify source exists
  if (!(await fileExists(SOURCE_DIR))) {
    console.error(`Source directory not found: ${SOURCE_DIR}`);
    process.exit(1);
  }
  // Create output directories
  await mkdir(OUTPUT_DIR, { recursive: true });
  await mkdir(join(OUTPUT_DIR, 'assets'), { recursive: true });
  // Phase 1: Discovery
  console.log('Discovering files...');
  const mdFiles = await discoverFiles(SOURCE_DIR);
  console.log(`Found ${mdFiles.length} markdown files`);
  // Build mappings
  const { uuidMap, outputMap } = buildMappings(mdFiles);
  console.log(`UUID map: ${uuidMap.size} entries`);
  console.log(`Output map: ${outputMap.size} entries`);
  // Phase 2-5: Process each file. A per-file failure becomes a warning so
  // the rest of the export still converts.
  console.log('Processing files...');
  let processed = 0;
  for (const [sourcePath, outputInfo] of outputMap) {
    try {
      await processFile(sourcePath, outputInfo, uuidMap);
    } catch (err) {
      warn(`Failed to process ${basename(sourcePath)}: ${err.message}`);
    }
    processed++;
    if (processed % 200 === 0) {
      console.log(`  ${processed}/${outputMap.size} files processed...`);
    }
  }
  // Summary
  console.log('\n--- Summary ---');
  console.log(`Files converted: ${stats.files}`);
  console.log(`Assets copied: ${stats.assets}`);
  console.log(`Warnings: ${stats.warnings.length}`);
  if (stats.warnings.length > 0) {
    console.log('\nWarnings:');
    for (const w of stats.warnings) {
      console.log(`  - ${w}`);
    }
  }
}
// Entry point: surface any unhandled failure and exit non-zero.
main().catch(err => {
  console.error('Fatal error:', err);
  process.exit(1);
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment