Skip to content

Instantly share code, notes, and snippets.

@ErikBrendel
Last active November 27, 2025 09:00
Show Gist options
  • Select an option

  • Save ErikBrendel/240864885851acd090bc7a516b8a3ca9 to your computer and use it in GitHub Desktop.

Select an option

Save ErikBrendel/240864885851acd090bc7a516b8a3ca9 to your computer and use it in GitHub Desktop.
Extract the Archbee documentation document tree
#!/usr/bin/env node
/**
* Extracts all documentation pages from Archbee __NEXT_DATA__
*/
const https = require("https")
/**
 * Perform a GET request and buffer the entire response body as a string.
 * @param {string} url - Absolute https:// URL to fetch.
 * @returns {Promise<{status: number, body: string}>} HTTP status code and body text.
 */
function fetch(url) {
  return new Promise((resolve, reject) => {
    const request = https.get(url, (response) => {
      const chunks = []
      response.on("data", (chunk) => chunks.push(chunk))
      response.on("end", () => {
        resolve({ status: response.statusCode, body: chunks.join("") })
      })
    })
    // Connection-level failures (DNS, TLS, refused) reject the promise.
    request.on("error", reject)
  })
}
/**
 * Extract and parse the Next.js bootstrap JSON from a rendered HTML page.
 *
 * Uses a lazy [\s\S]*? capture rather than [^<]+ so that JSON payloads
 * containing a literal "<" (very common in documentation content strings)
 * are still matched up to the closing </script> tag.
 *
 * @param {string} html - Full HTML of the page.
 * @returns {object|null} Parsed __NEXT_DATA__ object, or null if the
 *   script tag is absent or its contents are not valid JSON.
 */
function extractNextData(html) {
  const match = html.match(/<script id="__NEXT_DATA__"[^>]*>([\s\S]*?)<\/script>/)
  if (!match) return null
  try {
    return JSON.parse(match[1])
  } catch (e) {
    console.error("Failed to parse __NEXT_DATA__:", e.message)
    return null
  }
}
/**
 * Walk the Archbee public docs tree inside __NEXT_DATA__ and flatten it
 * into a list of page records.
 *
 * Fixes over the naive walk:
 *  - nodes without a string urlKey are skipped instead of throwing a
 *    TypeError at urlKey.split(...)
 *  - publicDocsTree is verified to be an array before .forEach
 *  - the always-truthy urlPath check ("/" + urlKey) was removed
 *
 * @param {object} nextData - Parsed __NEXT_DATA__ object.
 * @returns {Array<{title: string, urlPath: string, ancestors: string, docId: string}>}
 */
function extractPages(nextData) {
  const pages = []
  const docSpace = nextData?.props?.pageProps?._docSpace
  if (!docSpace || !Array.isArray(docSpace.publicDocsTree)) {
    console.error("Could not find publicDocsTree in data structure")
    return pages
  }
  // Recursively extract all documents from the tree.
  function extractFromNode(node, ancestors = []) {
    // Guard: malformed nodes (no id, or no usable urlKey) are skipped.
    if (!node || !node.id || typeof node.urlKey !== "string") return
    const docId = node.id.replace("PUBLISHED-", "")
    const urlPath = "/" + node.urlKey
    const title = node.name || node.categoryName
    // Slug is the last segment of urlKey; used as the ancestor path piece.
    const slug = node.urlKey.split("/").pop()
    const ancestorsPath = ancestors.join("/")
    if (docId && title) {
      pages.push({ title, urlPath, ancestors: ancestorsPath, docId })
    }
    // Recurse into children, passing the current slug as an ancestor.
    if (Array.isArray(node.children)) {
      const childAncestors = [...ancestors, slug]
      node.children.forEach((child) => extractFromNode(child, childAncestors))
    }
  }
  docSpace.publicDocsTree.forEach((node) => extractFromNode(node, []))
  return pages
}
/**
 * Quote a single CSV field per RFC 4180.
 *
 * Quotes the field when it contains a double quote, comma, LF, or CR
 * (the original missed "\r", which corrupts rows for CRLF-bearing data),
 * doubling any embedded quotes. Non-string / nullish input is coerced
 * defensively so a missing field yields an empty cell rather than a throw.
 *
 * @param {*} value - Field value; stringified if not already a string.
 * @returns {string} The field, quoted if necessary.
 */
function escapeCSV(value) {
  const s = String(value ?? "")
  if (s.includes('"') || s.includes(",") || s.includes("\n") || s.includes("\r")) {
    return '"' + s.replace(/"/g, '""') + '"'
  }
  return s
}
/**
 * Entry point: fetch the docs URL from argv, extract the page tree, and
 * print it to stdout as CSV (title,urlPath,ancestors,ID). All progress
 * and error messages go to stderr so stdout remains clean CSV.
 *
 * Fixes: docId is now escaped like every other field (it was the only
 * unescaped column), and an unhandled error sets a non-zero exit code
 * instead of being logged while the process exits 0.
 */
async function main() {
  const docsUrl = process.argv[2]
  if (!docsUrl) {
    console.error("Usage: node archbee-parse.cjs <docs-url>")
    console.error("Example: node archbee-parse.cjs https://docs.your-page.com/")
    process.exit(1)
  }
  console.error("Fetching documentation data...\n")
  const { status, body } = await fetch(docsUrl)
  if (status !== 200) {
    console.error(`Failed to fetch page: HTTP ${status}`)
    process.exit(1)
  }
  const nextData = extractNextData(body)
  if (!nextData) {
    console.error("Could not extract __NEXT_DATA__ from page")
    process.exit(1)
  }
  const pages = extractPages(nextData)
  console.error(`Found ${pages.length} pages.\n`)
  // Output as CSV; escape every field consistently (docId included).
  console.log("title,urlPath,ancestors,ID")
  for (const page of pages) {
    const row = [page.title, page.urlPath, page.ancestors, page.docId]
    console.log(row.map(escapeCSV).join(","))
  }
}
main().catch((err) => {
  console.error(err)
  process.exitCode = 1
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment