Skip to content

Instantly share code, notes, and snippets.

@kjunichi
Created February 17, 2026 05:22
Show Gist options
  • Select an option

  • Save kjunichi/72c28e0eb64c5e6330af53604e666003 to your computer and use it in GitHub Desktop.

Select an option

Save kjunichi/72c28e0eb64c5e6330af53604e666003 to your computer and use it in GitHub Desktop.
const puppeteer = require('puppeteer');
const fs = require('fs/promises');
const path = require('path');
/**
* Sanitizes a string to be used as a valid filename.
* @param {string} title The string to sanitize.
* @returns {string} A sanitized string safe for use as a filename.
*/
function sanitizeFilename(title) {
  // Replace characters that are invalid on Windows/macOS/Linux with an
  // underscore. The class now also covers the backslash (missing from the
  // original pattern) and ASCII control characters (0x00-0x1F), all of
  // which are rejected by Windows filesystems.
  const invalidChars = /[\\/:"*?<>|\u0000-\u001f]/g;
  const sanitized = title.replace(invalidChars, '_');
  // Limit length to avoid issues with file systems (e.g. 255-byte limits).
  return sanitized.substring(0, 200);
}
/**
* Processes a single note page, saves its content (plus any linked tweets)
* to a text file, and finds the link to the next article.
* @param {import('puppeteer').Browser} browser The Puppeteer browser instance.
* @param {string} url The URL of the page to process.
* @returns {Promise<{nextUrl: string|null, pageTitle: string}>} The URL of the next article and the title of the current page.
*/
async function processPage(browser, url) {
  const page = await browser.newPage();
  let pageTitle = '';
  try {
    console.log(`Navigating to ${url}...`);
    await page.goto(url, { waitUntil: 'networkidle2' });

    console.log('Extracting main page title and content...');
    const rawPageTitle = await page.title();
    // Strip the "|kjunichi" author suffix from the page title.
    // BUG FIX: the original pattern /\s*|kjunichi/g had an unescaped pipe,
    // turning it into an alternation whose \s* branch matched (empty) at
    // every position — it removed ALL whitespace and never removed the
    // "kjunichi" suffix at all. The pipe is now escaped; the full-width
    // "｜" separator is matched too, since note.com titles may use it.
    pageTitle = rawPageTitle.replace(/\s*[|｜]\s*kjunichi/g, '').trim();
    const mainContent = await page.evaluate(() => document.body.innerText);

    // Derive the output filename from the title; fall back to the note ID
    // (last URL path segment), then a timestamp, if the title is unusable.
    let sanitizedTitle = sanitizeFilename(pageTitle);
    if (!sanitizedTitle) {
      console.warn('Page title was empty after sanitization. Using note ID as filename.');
      try {
        const urlObject = new URL(url);
        const pathParts = urlObject.pathname.split('/');
        const noteId = pathParts[pathParts.length - 1];
        sanitizedTitle = sanitizeFilename(noteId);
      } catch (e) {
        sanitizedTitle = `untitled-${Date.now()}`;
      }
    }
    const outputFilename = `${sanitizedTitle}.txt`;
    console.log(`Output will be saved to: ${outputFilename}`);

    // Collect links to the author's tweets embedded in the article.
    console.log('Searching for Twitter links...');
    const twitterUrls = await page.$$eval('a', (anchors) =>
      anchors
        .map((a) => a.href)
        .filter((href) => href.includes('twitter.com/kjunichi/status/'))
    );
    console.log(`Found ${twitterUrls.length} Twitter link(s).`);

    let allContent = [
      `--- Page Content from: ${url} ---\n`,
      mainContent,
      `\n--- End of Page Content ---\n\n`
    ];

    // Visit each unique tweet and append its text. A failure on one tweet
    // is recorded in the output but does not abort the whole page.
    for (const twitterUrl of new Set(twitterUrls)) {
      let twitterPage;
      try {
        console.log(`Scraping content from: ${twitterUrl}`);
        twitterPage = await browser.newPage();
        await twitterPage.goto(twitterUrl, { waitUntil: 'networkidle2' });
        const twitterContent = await twitterPage.evaluate(() => document.body.innerText);
        // Drop everything before the author header so only the tweet body
        // (and below) is kept; keep the raw text if the marker is absent.
        const marker = 'kjunichi @kjunichi';
        const markerIndex = twitterContent.indexOf(marker);
        const cleanedContent = markerIndex !== -1
          ? twitterContent.substring(markerIndex + marker.length).trim()
          : twitterContent;
        allContent.push(
          `--- Content from Twitter URL: ${twitterUrl} ---\n`,
          cleanedContent,
          `\n--- End of Twitter Content ---\n\n`
        );
      } catch (err) {
        console.error(`Failed to scrape ${twitterUrl}: ${err.message}`);
        allContent.push(
          `--- FAILED to get content from: ${twitterUrl} ---\n\n`
        );
      } finally {
        // Always close the per-tweet tab, even on failure.
        if (twitterPage) await twitterPage.close();
      }
    }

    console.log('Combining all content and saving to file...');
    await fs.writeFile(outputFilename, allContent.join(`\n`));
    console.log(`Successfully saved all content to ${path.resolve(outputFilename)}`);

    // The "next article" widget may be lazy-loaded; scroll to the bottom to
    // trigger it, then wait briefly for a candidate element to appear.
    console.log('Scrolling to the bottom of the page to trigger lazy-loaded elements...');
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    console.log('Waiting for potential next-link elements to appear...');
    try {
      await page.waitForSelector('a.o-prevNext__button--next, h2', { timeout: 5000 });
      console.log('A potential next-link element was found.');
    } catch (e) {
      // Timeout is expected on the final article; continue and let the
      // in-page strategies below return null.
      console.log('Did not find any potential next-link elements within the timeout.');
    }

    const nextUrl = await page.evaluate(() => {
      // Strategy 1: Look for the specific "o-prevNext" component
      const prevNextLink = document.querySelector('a.o-prevNext__button--next');
      if (prevNextLink && prevNextLink.href) {
        return prevNextLink.href;
      }
      // Strategy 2: Find a heading containing "翌日" ("next day") and check its sibling
      const h2s = Array.from(document.querySelectorAll('h2'));
      const yokujitsuHeader = h2s.find(h2 => h2.innerText.includes('翌日'));
      if (yokujitsuHeader) {
        let sibling = yokujitsuHeader.nextElementSibling;
        if (sibling) {
          // Case A: The sibling itself is the link
          if (sibling.tagName === 'A' && sibling.href) {
            return sibling.href;
          }
          // Case B: The link is an embed iframe inside the sibling
          // (e.g. in a <figure>); rebuild the note URL from the note ID
          // in the iframe's src (or data-src when lazily loaded).
          const iframe = sibling.querySelector('iframe');
          if (iframe) {
            let sourceUrl = iframe.src;
            if (!sourceUrl && iframe.dataset.src) {
              sourceUrl = iframe.dataset.src;
            }
            if (sourceUrl) {
              try {
                const srcUrl = new URL(sourceUrl);
                const pathParts = srcUrl.pathname.split('/');
                const noteId = pathParts[pathParts.length - 1];
                if (noteId) {
                  return `https://note.com/kjunichi/n/${noteId}`;
                }
              } catch (e) { /* Ignore malformed embed URLs */ }
            }
          }
          // Case C: Any plain anchor inside the sibling
          const directLink = sibling.querySelector('a');
          if (directLink && directLink.href) {
            return directLink.href;
          }
        }
      }
      // Strategy 3: Fallback to find any link with "次の記事へ" ("to the next article") text
      const allLinks = Array.from(document.querySelectorAll('a'));
      const nextArticleLink = allLinks.find(a => a.innerText.includes('次の記事へ'));
      if (nextArticleLink && nextArticleLink.href) {
        return nextArticleLink.href;
      }
      return null;
    });

    return { nextUrl, pageTitle };
  } finally {
    // Always close the tab for this article, even if processing threw.
    if (page) await page.close();
  }
}
/**
 * Main entry point: launches a headless browser and walks the chain of
 * articles starting from the initial URL, processing one page per
 * iteration until a stop condition is reached.
 */
async function main() {
  console.log('Launching browser...');
  const browser = await puppeteer.launch({ headless: true });
  // Starting URL for the crawl.
  let currentUrl = 'https://note.com/';
  try {
    do {
      const { nextUrl, pageTitle } = await processPage(browser, currentUrl);
      // Stop once the December 31st entry has been processed — the chain
      // of daily articles ends at year's end.
      const reachedYearEnd = Boolean(pageTitle) && pageTitle.includes('12月31日');
      if (reachedYearEnd) {
        console.log('Reached December 31st. Stopping after this page.');
        currentUrl = null;
      } else if (nextUrl) {
        console.log(`Next page found: ${nextUrl}`);
        currentUrl = nextUrl;
      } else {
        console.log('No more pages to process.');
        currentUrl = null;
      }
    } while (currentUrl);
  } catch (error) {
    console.error('An unexpected error occurred during the process:', error);
  } finally {
    // Always shut the browser down, whether the loop finished or threw.
    console.log('Closing browser...');
    await browser.close();
  }
}
// --- Main Execution ---
main();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment