Skip to content

Instantly share code, notes, and snippets.

@siygle
Created February 21, 2026 02:41
Show Gist options
  • Select an option

  • Save siygle/4e21f2e15d2e85c58954c27c34851f96 to your computer and use it in GitHub Desktop.

Select an option

Save siygle/4e21f2e15d2e85c58954c27c34851f96 to your computer and use it in GitHub Desktop.
Convert Notion exported files to pure markdown files (for Obsidian)
#!/usr/bin/env node
import { access, copyFile, mkdir, readdir, readFile, stat, writeFile } from 'node:fs/promises';
import { basename, dirname, extname, join, relative, resolve, sep } from 'node:path';
// --- Configuration ---
// The export directory is the first CLI argument. It must be validated BEFORE
// calling resolve(): resolve(undefined) throws a TypeError, and resolve() never
// returns an empty string, so a check on the resolved value can never fire.
if (!process.argv[2]) throw new Error("Export files not existed!");
// Absolute path of the directory that is scanned for markdown files.
const SOURCE_DIR = resolve(process.argv[2]);
// Absolute path of the directory (relative to the current working directory)
// that receives the converted files.
const OUTPUT_DIR = resolve('./output');
// File extensions regarded as assets.
// NOTE(review): not referenced anywhere else in the visible file - confirm before removing.
const ASSET_EXTENSIONS = new Set(['.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp', '.pdf', '.bmp', '.ico']);
// Matches "<title> <32 lowercase hex chars>"; group 1 is the title, group 2 the id.
const UUID_REGEX = /^(.+)\s+([a-f0-9]{32})$/;
// --- Stats ---
// Run-wide counters plus the list of non-fatal problems collected along the way.
const stats = { files: 0, assets: 0, warnings: [] };

/**
 * Record a non-fatal problem in `stats.warnings`.
 * @param {string} msg - Description of the problem.
 * @returns {void}
 */
function warn(msg) {
  const { warnings } = stats;
  warnings.push(msg);
}
// --- Phase 1: Discovery ---
/**
 * List every entry below `sourceDir` (recursively) whose name ends in ".md".
 * @param {string} sourceDir - Directory to scan.
 * @returns {Promise<string[]>} Each matching entry joined onto `sourceDir`.
 */
async function discoverFiles(sourceDir) {
  const relativeEntries = await readdir(sourceDir, { recursive: true });
  return relativeEntries
    .filter((name) => name.endsWith('.md'))
    .map((name) => join(sourceDir, name));
}
/**
 * Split a markdown file name into its title and trailing id.
 * The ".md" extension is removed first; if what remains matches UUID_REGEX,
 * the two captured groups are returned, otherwise the whole name is the title.
 * @param {string} filename - File name or path.
 * @returns {{ title: string, uuid: string | null }}
 */
function cleanTitle(filename) {
  const stem = basename(filename, '.md');
  const parsed = stem.match(UUID_REGEX);
  return parsed
    ? { title: parsed[1], uuid: parsed[2] }
    : { title: stem, uuid: null };
}
/**
 * Directory of `filePath` expressed relative to SOURCE_DIR.
 *
 * Uses path.relative() instead of slicing off a fixed-length prefix, so the
 * result is also correct when SOURCE_DIR already ends in a separator (for
 * example a filesystem root).
 * @param {string} filePath - Path of a file located under SOURCE_DIR.
 * @returns {string} Relative directory using the platform separator, or ''
 *   when the file sits directly in SOURCE_DIR.
 */
function getRelativeDir(filePath) {
  return relative(SOURCE_DIR, dirname(filePath));
}
/**
 * Build the lookup tables used while converting files.
 *
 * Files whose cleaned title is identical within the same relative directory
 * are sorted by source path; the first keeps the title, the others receive a
 * " (n)" suffix. The suffix is chosen so that it never equals a title already
 * used in that directory, which prevents two notes from being written to the
 * same output file. The uuid map carries the FINAL (possibly suffixed) title,
 * so a link to a renamed note targets that note and not its namesake.
 *
 * @param {string[]} mdFiles - Source paths of the markdown files.
 * @returns {{
 *   uuidMap: Map<string, { sourcePath: string, cleanTitle: string }>,
 *   outputMap: Map<string, { relDir: string, title: string }>
 * }} `uuidMap` is keyed by id, `outputMap` by source path.
 */
function buildMappings(mdFiles) {
  // uuid -> { sourcePath, cleanTitle }
  const uuidMap = new Map();
  // "relDir/title" -> entries sharing that title in that directory
  const dirTitleGroups = new Map();
  // sourcePath -> { relDir, title }
  const outputMap = new Map();

  for (const filePath of mdFiles) {
    const { title, uuid } = cleanTitle(filePath);
    const relDir = getRelativeDir(filePath);
    // Shared with uuidMap so the title can be corrected once collisions are resolved.
    const linkInfo = { sourcePath: filePath, cleanTitle: title };
    if (uuid) {
      uuidMap.set(uuid, linkInfo);
    }
    const key = `${relDir}/${title}`;
    if (!dirTitleGroups.has(key)) {
      dirTitleGroups.set(key, []);
    }
    dirTitleGroups.get(key).push({ filePath, relDir, title, linkInfo });
  }

  // Every base title is reserved up front; generated names must avoid them all.
  const claimed = new Set(dirTitleGroups.keys());

  // Resolve collisions deterministically (scoped per directory)
  for (const entries of dirTitleGroups.values()) {
    entries.sort((a, b) => a.filePath.localeCompare(b.filePath));
    let suffix = 1;
    entries.forEach((entry, index) => {
      let title = entry.title;
      if (index > 0) {
        do {
          suffix += 1;
          title = `${entry.title} (${suffix})`;
        } while (claimed.has(`${entry.relDir}/${title}`));
        claimed.add(`${entry.relDir}/${title}`);
      }
      outputMap.set(entry.filePath, { relDir: entry.relDir, title });
      entry.linkInfo.cleanTitle = title;
    });
  }

  return { uuidMap, outputMap };
}
// --- Phase 2: Metadata Parser ---
// Lower-case English month name -> zero-based month index.
const MONTHS = {
  'january': 0, 'february': 1, 'march': 2, 'april': 3,
  'may': 4, 'june': 5, 'july': 6, 'august': 7,
  'september': 8, 'october': 9, 'november': 10, 'december': 11
};

/**
 * Convert a date string into "YYYY-MM-DD".
 *
 * Two shapes are recognized:
 *   1. "2024/02/16"
 *   2. "October 19, 2010 12:36 PM" (anything after the year is ignored)
 *
 * @param {string} dateStr - Raw date text.
 * @returns {string | null} The normalized date, or null when the text has
 *   neither shape or the month name is unknown.
 */
function parseNotionDate(dateStr) {
  // Pattern 1: "2024/02/16"
  if (/^\d{4}\/\d{2}\/\d{2}$/.test(dateStr)) {
    return dateStr.replaceAll('/', '-');
  }

  // Pattern 2: "October 19, 2010 12:36 PM" or "October 31, 2024 12:00 AM"
  const longMatch = dateStr.match(/^(\w+)\s+(\d{1,2}),\s+(\d{4})/);
  if (!longMatch) return null;

  const [, monthName, day, year] = longMatch;
  const monthKey = monthName.toLowerCase();
  // Own-property check: a plain lookup would also succeed for inherited keys
  // such as "constructor" and produce a garbage month.
  if (!Object.hasOwn(MONTHS, monthKey)) return null;

  const monthStr = String(MONTHS[monthKey] + 1).padStart(2, '0');
  return `${year}-${monthStr}-${day.padStart(2, '0')}`;
}
const METADATA_PATTERNS = [
{ key: 'tags', regex: /^tags:\s*(.+)$/i },
{ key: 'created', regex: /^(?:date|created|saved):\s*(.+)$/i },
// Fields we recognize but skip
{ key: '_skip', regex: /^(?:type|status|slug|summary|category|updated|url|like|property|name):\s*(.+)$/i },
];
function parseMetadata(content) {
const lines = content.split('\n');
let title = '';
let tags = [];
let created = null;
let bodyStartIndex = 0;
// Extract title from first line
let i = 0;
for (; i < lines.length; i++) {
const line = lines[i].trim();
if (line.startsWith('# ')) {
title = line.slice(2).trim();
i++;
break;
}
if (line !== '') break; // No title found
}
// Skip blank lines after title
while (i < lines.length && lines[i].trim() === '') {
i++;
}
// Parse metadata lines until blank line or unrecognized line
const metaStart = i;
let foundMeta = false;
for (; i < lines.length; i++) {
const line = lines[i].trim();
if (line === '') {
// End of metadata block
i++;
break;
}
let matched = false;
for (const { key, regex } of METADATA_PATTERNS) {
const m = line.match(regex);
if (m) {
matched = true;
if (key === 'tags') {
tags = m[1].split(',').map(t => t.trim()).filter(Boolean);
} else if (key === 'created') {
const parsed = parseNotionDate(m[1].trim());
if (parsed) created = parsed;
}
// _skip: just consume the line
break;
}
}
if (!matched) {
// This line is body content, not metadata
// But only if we haven't found any metadata yet - it might be that
// this file simply has no metadata block
if (!foundMeta) {
i = metaStart;
}
break;
}
foundMeta = true;
}
// If no metadata was found, reset to after title + blank lines
if (!foundMeta) {
i = metaStart;
}
bodyStartIndex = i;
return { title, tags, created, bodyStartIndex };
}
/**
 * Render a frontmatter block delimited by "---" lines.
 * @param {string[]} tags - Emitted as a "tags:" list when non-empty.
 * @param {string | null} created - Emitted as "created: <value>" when truthy.
 * @returns {string} The block without a trailing newline.
 */
function generateFrontmatter(tags, created) {
  const tagLines = tags.length > 0
    ? ['tags:', ...tags.map((tag) => ` - ${tag}`)]
    : [];
  const createdLines = created ? [`created: ${created}`] : [];
  return ['---', ...tagLines, ...createdLines, '---'].join('\n');
}
// --- Balanced parentheses link extraction ---
/**
 * Read the parenthesized part of a markdown link, allowing nested parentheses.
 * @param {string} text - Text being scanned.
 * @param {number} startIndex - Index of the '(' that follows the link's ']'.
 * @returns {{ path: string, endIndex: number } | null} `path` is the text
 *   between the outer parentheses and `endIndex` the index just past the
 *   closing ')'. Null when `text[startIndex]` is not '(' or the parentheses
 *   never balance.
 */
function extractBalancedPath(text, startIndex) {
  if (text[startIndex] !== '(') return null;

  let open = 0;
  let pos = startIndex;
  while (pos < text.length) {
    const ch = text[pos];
    if (ch === '(') {
      open += 1;
    } else if (ch === ')') {
      open -= 1;
      if (open === 0) {
        return { path: text.slice(startIndex + 1, pos), endIndex: pos + 1 };
      }
    }
    pos += 1;
  }
  // Ran off the end without closing the outer parenthesis.
  return null;
}
/**
 * Locate the ']' that balances the '[' at `openIndex`, honoring nested brackets.
 * @param {string} text - Text being scanned.
 * @param {number} openIndex - Index where scanning starts (the opening '[').
 * @returns {number} Index of the balancing ']', or -1 when there is none.
 */
function findClosingBracket(text, openIndex) {
  let nesting = 0;
  let pos = openIndex;
  while (pos < text.length) {
    switch (text[pos]) {
      case '[':
        nesting += 1;
        break;
      case ']':
        nesting -= 1;
        if (nesting === 0) return pos;
        break;
      default:
        break;
    }
    pos += 1;
  }
  return -1;
}
// --- Phase 3: Image Link Rewriter ---
/**
 * Rewrite every local `![alt](path)` image in `body` to point into the shared
 * assets directory and queue the referenced file for copying.
 *
 * - http(s) URLs and `data:` URIs are not local files and are left untouched.
 * - A path that cannot be percent-decoded is left untouched and reported via warn().
 * - The destination file name is "<postTitle> - <original file name>" with
 *   characters that are invalid in file names replaced by "_".
 * - The link written to the markdown is percent-encoded: the generated name
 *   always contains spaces, and a bare space is not allowed in a markdown link
 *   destination. Parentheses are encoded too so the link stays balanced.
 *
 * @param {string} body - Markdown text to process.
 * @param {string} sourceMdDir - Directory of the source markdown file; relative image paths resolve against it.
 * @param {string} postTitle - Title used as prefix of the destination file name.
 * @param {{ source: string, destName: string }[]} assetsToCopy - Output array; one entry is pushed per rewritten image.
 * @param {string} assetsRelPath - Link prefix leading to the assets directory (ends with "/").
 * @returns {string} The rewritten markdown.
 */
function rewriteImageLinks(body, sourceMdDir, postTitle, assetsToCopy, assetsRelPath) {
  // Scan for ![ then find balanced ] followed by balanced ()
  const imageStartRegex = /!\[/g;
  const nonLocalTarget = /^(?:https?:\/\/|data:)/i;
  let result = '';
  let lastIndex = 0;
  let m;
  while ((m = imageStartRegex.exec(body)) !== null) {
    const bracketOpen = m.index + 1; // index of '['
    const bracketClose = findClosingBracket(body, bracketOpen);
    if (bracketClose === -1) continue;
    if (body[bracketClose + 1] !== '(') continue;
    const alt = body.slice(bracketOpen + 1, bracketClose);
    const extracted = extractBalancedPath(body, bracketClose + 1);
    if (!extracted) continue;
    const rawPath = extracted.path;

    // Flush the text before this image and resume scanning after it.
    result += body.slice(lastIndex, m.index);
    lastIndex = extracted.endIndex;
    imageStartRegex.lastIndex = lastIndex;
    const unchanged = `![${alt}](${rawPath})`;

    // Skip external URLs and inline data
    if (nonLocalTarget.test(rawPath)) {
      result += unchanged;
      continue;
    }

    let decodedPath;
    try {
      decodedPath = decodeURIComponent(rawPath);
    } catch {
      warn(`Failed to decode image path: ${rawPath}`);
      result += unchanged;
      continue;
    }

    const absolutePath = resolve(sourceMdDir, decodedPath);
    const assetFilename = basename(decodedPath);
    const newAssetName = `${postTitle} - ${assetFilename}`;
    const safeAssetName = newAssetName.replace(/[<>:"/\\|?*]/g, '_');
    assetsToCopy.push({
      source: absolutePath,
      destName: safeAssetName,
    });

    // The file on disk keeps the readable name; only the link is encoded.
    const encodedName = encodeURIComponent(safeAssetName)
      .replaceAll('(', '%28')
      .replaceAll(')', '%29');
    result += `![${alt}](${assetsRelPath}${encodedName})`;
  }
  result += body.slice(lastIndex);
  return result;
}
// --- Phase 4: Internal Link Rewriter ---
/**
 * Decide what a single `[text](rawPath)` link becomes.
 * @param {string} text - Link text.
 * @param {string} rawPath - Link destination exactly as written.
 * @param {Map<string, { cleanTitle: string }>} uuidMap - Id -> note info.
 * @returns {string} Replacement markdown for the link.
 */
function renderInternalLink(text, rawPath, uuidMap) {
  const asWritten = `[${text}](${rawPath})`;

  // External targets are never rewritten.
  const isExternal = ['http://', 'https://', 'evernote://'].some((scheme) => rawPath.startsWith(scheme));
  if (isExternal) return asWritten;

  let decodedPath;
  try {
    decodedPath = decodeURIComponent(rawPath);
  } catch {
    return asWritten;
  }

  // A ".md" target whose name carries a known id becomes a wikilink.
  if (decodedPath.endsWith('.md')) {
    const uuidMatch = basename(decodedPath, '.md').match(UUID_REGEX);
    if (uuidMatch) {
      const uuid = uuidMatch[2];
      const entry = uuidMap.get(uuid);
      if (entry) {
        const linkTarget = entry.cleanTitle;
        return text === linkTarget ? `[[${linkTarget}]]` : `[[${linkTarget}|${text}]]`;
      }
      warn(`Could not resolve internal link UUID: ${uuid} for "${text}"`);
    }
  }

  // A ".csv" target (Notion database view) is reduced to its plain text.
  if (decodedPath.endsWith('.csv')) return text;

  return asWritten;
}

/**
 * Rewrite the non-image markdown links in `body`.
 *
 * Links to ".md" files whose name ends in an id present in `uuidMap` become
 * `[[Title]]` or `[[Title|text]]`; links to ".csv" files are replaced by their
 * text; every other link is kept as written.
 *
 * @param {string} body - Markdown text to process.
 * @param {Map<string, { cleanTitle: string }>} uuidMap - Id -> note info.
 * @returns {string} The rewritten markdown.
 */
function rewriteInternalLinks(body, uuidMap) {
  // '[' not preceded by '!' - image links are handled separately.
  const linkStartRegex = /(?<!!)\[/g;
  const pieces = [];
  let consumedUpTo = 0;

  for (let hit = linkStartRegex.exec(body); hit !== null; hit = linkStartRegex.exec(body)) {
    const openAt = hit.index;
    const closeAt = findClosingBracket(body, openAt);
    if (closeAt === -1 || body[closeAt + 1] !== '(') continue;

    const target = extractBalancedPath(body, closeAt + 1);
    if (!target) continue;

    const linkText = body.slice(openAt + 1, closeAt);
    pieces.push(body.slice(consumedUpTo, openAt));
    pieces.push(renderInternalLink(linkText, target.path, uuidMap));

    // Resume scanning right after the link that was just handled.
    consumedUpTo = target.endIndex;
    linkStartRegex.lastIndex = consumedUpTo;
  }

  pieces.push(body.slice(consumedUpTo));
  return pieces.join('');
}
// --- Phase 5: Main Processing ---
/**
 * Report whether `filePath` can be accessed.
 * @param {string} filePath - Path to probe.
 * @returns {Promise<boolean>} true when access() succeeds, false when it rejects.
 */
async function fileExists(filePath) {
  let reachable = true;
  try {
    await access(filePath);
  } catch {
    reachable = false;
  }
  return reachable;
}
/**
 * Convert one markdown file: strip its title/metadata header, rewrite image
 * and internal links, prepend frontmatter, write the result under OUTPUT_DIR
 * and copy the images it references into OUTPUT_DIR/assets.
 *
 * Increments `stats.files` after the markdown is written and `stats.assets`
 * for each copied asset. Missing or uncopyable assets are reported via warn()
 * and do not abort the conversion.
 *
 * @param {string} sourcePath - Path of the source markdown file.
 * @param {{ relDir: string, title: string }} outputInfo - Output sub-directory
 *   (relative to OUTPUT_DIR, '' for the top level) and output file name without extension.
 * @param {Map<string, object>} uuidMap - Passed through to rewriteInternalLinks().
 * @returns {Promise<void>}
 */
async function processFile(sourcePath, { relDir, title: outputFilename }, uuidMap) {
  const content = await readFile(sourcePath, 'utf-8');
  const { tags, created, bodyStartIndex } = parseMetadata(content);
  let body = content.split('\n').slice(bodyStartIndex).join('\n');

  // Compute relative path from this file's output dir to the assets/ dir.
  // relDir is a filesystem path, so it is split on the platform separator
  // (splitting on '/' undercounts the depth where the separator is '\').
  // The link itself always uses '/'.
  const depth = relDir ? relDir.split(sep).length : 0;
  const assetsRelPath = `${'../'.repeat(depth)}assets/`;

  // Rewrite image links
  const assetsToCopy = [];
  const sourceMdDir = dirname(sourcePath);
  body = rewriteImageLinks(body, sourceMdDir, outputFilename, assetsToCopy, assetsRelPath);

  // Rewrite internal links
  body = rewriteInternalLinks(body, uuidMap);

  // Generate output content; leading blank lines of the body are dropped so
  // exactly one blank line separates it from the frontmatter.
  const frontmatter = generateFrontmatter(tags, created);
  const outputContent = `${frontmatter}\n\n${body.replace(/^\n+/, '')}`;

  // Write markdown file into the correct subdirectory
  const outputDir = relDir ? join(OUTPUT_DIR, relDir) : OUTPUT_DIR;
  await mkdir(outputDir, { recursive: true });
  const outputPath = join(outputDir, `${outputFilename}.md`);
  await writeFile(outputPath, outputContent, 'utf-8');
  stats.files++;

  // Copy assets
  for (const asset of assetsToCopy) {
    const destPath = join(OUTPUT_DIR, 'assets', asset.destName);
    if (await fileExists(asset.source)) {
      try {
        await copyFile(asset.source, destPath);
        stats.assets++;
      } catch (err) {
        warn(`Failed to copy asset ${asset.source}: ${err.message}`);
      }
    } else {
      warn(`Asset not found: ${asset.source} (referenced from ${basename(sourcePath)})`);
    }
  }
}
/**
 * Entry point: check the source directory, create the output directories,
 * discover and map the markdown files, convert them one after another and
 * print a summary. A failure while converting a single file is recorded as a
 * warning and does not stop the run.
 * @returns {Promise<void>}
 */
async function main() {
  console.log(`Source: ${SOURCE_DIR}`);
  console.log(`Output: ${OUTPUT_DIR}`);

  // Verify source exists
  const sourcePresent = await fileExists(SOURCE_DIR);
  if (!sourcePresent) {
    console.error(`Source directory not found: ${SOURCE_DIR}`);
    process.exit(1);
  }

  // Create output directories
  await mkdir(OUTPUT_DIR, { recursive: true });
  await mkdir(join(OUTPUT_DIR, 'assets'), { recursive: true });

  // Phase 1: Discovery
  console.log('Discovering files...');
  const mdFiles = await discoverFiles(SOURCE_DIR);
  console.log(`Found ${mdFiles.length} markdown files`);

  // Build mappings
  const { uuidMap, outputMap } = buildMappings(mdFiles);
  console.log(`UUID map: ${uuidMap.size} entries`);
  console.log(`Output map: ${outputMap.size} entries`);

  // Phase 2-5: Process each file
  console.log('Processing files...');
  let done = 0;
  for (const [sourcePath, outputInfo] of outputMap.entries()) {
    try {
      await processFile(sourcePath, outputInfo, uuidMap);
    } catch (err) {
      warn(`Failed to process ${basename(sourcePath)}: ${err.message}`);
    }
    done += 1;
    // Progress line every 200 files.
    if (done % 200 === 0) {
      console.log(` ${done}/${outputMap.size} files processed...`);
    }
  }

  // Summary
  const { files, assets, warnings } = stats;
  console.log('\n--- Summary ---');
  console.log(`Files converted: ${files}`);
  console.log(`Assets copied: ${assets}`);
  console.log(`Warnings: ${warnings.length}`);
  if (warnings.length === 0) return;

  console.log('\nWarnings:');
  warnings.forEach((w) => {
    console.log(` - ${w}`);
  });
}

// Run the script; anything that escapes main() is fatal.
main().catch((err) => {
  console.error('Fatal error:', err);
  process.exit(1);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment