Created
February 17, 2026 05:22
-
-
Save kjunichi/72c28e0eb64c5e6330af53604e666003 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| const puppeteer = require('puppeteer'); | |
| const fs = require('fs/promises'); | |
| const path = require('path'); | |
| /** | |
| * Sanitizes a string to be used as a valid filename. | |
| * @param {string} title The string to sanitize. | |
| * @returns {string} A sanitized string safe for use as a filename. | |
| */ | |
| function sanitizeFilename(title) { | |
| // Replace invalid characters with an underscore | |
| const invalidChars = /[\/:"*?<>|]/g; | |
| let sanitized = title.replace(invalidChars, '_'); | |
| // Limit length to avoid issues with file systems | |
| return sanitized.substring(0, 200); | |
| } | |
/**
 * Processes a single note page, saves its content, and finds the next page link.
 * Side effects: writes "<title>.txt" to the current working directory.
 * @param {import('puppeteer').Browser} browser The Puppeteer browser instance.
 * @param {string} url The URL of the page to process.
 * @returns {Promise<{nextUrl: string|null, pageTitle: string}>} The URL of the next article and the title of the current page.
 */
async function processPage(browser, url) {
  const page = await browser.newPage();
  let pageTitle = '';
  try {
    console.log(`Navigating to ${url}...`);
    await page.goto(url, { waitUntil: 'networkidle2' });

    console.log('Extracting main page title and content...');
    const rawPageTitle = await page.title();
    // Strip the trailing "| kjunichi" / "｜kjunichi" site suffix from the title.
    // Fix: the original pattern /\s*|kjunichi/g had an unescaped "|", turning it
    // into an alternation whose empty-matching "\s*" branch removed ALL
    // whitespace from the title and never actually removed the suffix.
    pageTitle = rawPageTitle.replace(/\s*[|｜]\s*kjunichi/g, '').trim();
    const mainContent = await page.evaluate(() => document.body.innerText);

    // Derive the output filename from the title; fall back to the note ID
    // from the URL path, then to a timestamp, if the title sanitizes to ''.
    let sanitizedTitle = sanitizeFilename(pageTitle);
    if (!sanitizedTitle) {
      console.warn('Page title was empty after sanitization. Using note ID as filename.');
      try {
        const urlObject = new URL(url);
        const pathParts = urlObject.pathname.split('/');
        const noteId = pathParts[pathParts.length - 1];
        sanitizedTitle = sanitizeFilename(noteId);
      } catch (e) {
        sanitizedTitle = `untitled-${Date.now()}`;
      }
    }
    const outputFilename = `${sanitizedTitle}.txt`;
    console.log(`Output will be saved to: ${outputFilename}`);

    // Collect links to the author's own tweets embedded in the article.
    console.log('Searching for Twitter links...');
    const twitterUrls = await page.$$eval('a', (anchors) =>
      anchors
        .map((a) => a.href)
        .filter((href) => href.includes('twitter.com/kjunichi/status/'))
    );
    console.log(`Found ${twitterUrls.length} Twitter link(s).`);

    const allContent = [
      `--- Page Content from: ${url} ---\n`,
      mainContent,
      `\n--- End of Page Content ---\n\n`
    ];

    // Visit each unique tweet URL in its own tab; a failure on one tweet is
    // recorded in the output but does not abort the page as a whole.
    for (const twitterUrl of new Set(twitterUrls)) {
      let twitterPage;
      try {
        console.log(`Scraping content from: ${twitterUrl}`);
        twitterPage = await browser.newPage();
        await twitterPage.goto(twitterUrl, { waitUntil: 'networkidle2' });
        const twitterContent = await twitterPage.evaluate(() => document.body.innerText);
        // Drop everything up to and including the author header so only the
        // tweet body (and below) remains; keep the full text if not found.
        const marker = 'kjunichi @kjunichi';
        const markerIndex = twitterContent.indexOf(marker);
        const cleanedContent = markerIndex !== -1
          ? twitterContent.substring(markerIndex + marker.length).trim()
          : twitterContent;
        allContent.push(
          `--- Content from Twitter URL: ${twitterUrl} ---\n`,
          cleanedContent,
          `\n--- End of Twitter Content ---\n\n`
        );
      } catch (err) {
        console.error(`Failed to scrape ${twitterUrl}: ${err.message}`);
        allContent.push(
          `--- FAILED to get content from: ${twitterUrl} ---\n\n`
        );
      } finally {
        if (twitterPage) await twitterPage.close();
      }
    }

    console.log('Combining all content and saving to file...');
    await fs.writeFile(outputFilename, allContent.join(`\n`));
    console.log(`Successfully saved all content to ${path.resolve(outputFilename)}`);

    // The next-article widget is lazy-loaded; scroll to the bottom to make it render.
    console.log('Scrolling to the bottom of the page to trigger lazy-loaded elements...');
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    console.log('Waiting for potential next-link elements to appear...');
    try {
      await page.waitForSelector('a.o-prevNext__button--next, h2', { timeout: 5000 });
      console.log('A potential next-link element was found.');
    } catch (e) {
      console.log('Did not find any potential next-link elements within the timeout.');
    }

    // Resolve the next article URL using three strategies, most specific first.
    const nextUrl = await page.evaluate(() => {
      // Strategy 1: Look for the specific "o-prevNext" component
      const prevNextLink = document.querySelector('a.o-prevNext__button--next');
      if (prevNextLink && prevNextLink.href) {
        return prevNextLink.href;
      }

      // Strategy 2: Find a heading containing "翌日" and check its sibling
      const h2s = Array.from(document.querySelectorAll('h2'));
      const yokujitsuHeader = h2s.find(h2 => h2.innerText.includes('翌日'));
      if (yokujitsuHeader) {
        let sibling = yokujitsuHeader.nextElementSibling;
        if (sibling) {
          // Case A: The sibling itself is the link
          if (sibling.tagName === 'A' && sibling.href) {
            return sibling.href;
          }
          // Case B: The link is inside the sibling (e.g., in a <figure>);
          // embedded-note iframes carry the note ID in their src path.
          const iframe = sibling.querySelector('iframe');
          if (iframe) {
            let sourceUrl = iframe.src;
            // If src is empty, try data-src (lazy-loaded iframes)
            if (!sourceUrl && iframe.dataset.src) {
              sourceUrl = iframe.dataset.src;
            }
            if (sourceUrl) {
              try {
                const srcUrl = new URL(sourceUrl);
                const pathParts = srcUrl.pathname.split('/');
                const noteId = pathParts[pathParts.length - 1];
                if (noteId) {
                  return `https://note.com/kjunichi/n/${noteId}`;
                }
              } catch (e) { /* Ignore malformed embed URLs */ }
            }
          }
          const directLink = sibling.querySelector('a');
          if (directLink && directLink.href) {
            return directLink.href;
          }
        }
      }

      // Strategy 3: Fallback to find any link with "次の記事へ" text
      const allLinks = Array.from(document.querySelectorAll('a'));
      const nextArticleLink = allLinks.find(a => a.innerText.includes('次の記事へ'));
      if (nextArticleLink && nextArticleLink.href) {
        return nextArticleLink.href;
      }
      return null;
    });

    return { nextUrl, pageTitle };
  } finally {
    // Always release the tab, even on navigation/scrape errors.
    if (page) await page.close();
  }
}
/**
 * Main function to orchestrate the scraping loop.
 * Follows the chain of next-article links page by page, stopping once the
 * December 31st entry has been processed or no further link is found.
 */
async function main() {
  console.log('Launching browser...');
  const browser = await puppeteer.launch({ headless: true });
  let currentUrl = 'https://note.com/'; // starting URL for the crawl
  try {
    while (currentUrl) {
      const result = await processPage(browser, currentUrl);

      // Terminal page of the year: process it, then stop walking.
      if (result.pageTitle && result.pageTitle.includes('12月31日')) {
        console.log('Reached December 31st. Stopping after this page.');
        currentUrl = null;
        continue;
      }

      if (result.nextUrl) {
        console.log(`Next page found: ${result.nextUrl}`);
        currentUrl = result.nextUrl;
      } else {
        console.log('No more pages to process.');
        currentUrl = null;
      }
    }
  } catch (error) {
    console.error('An unexpected error occurred during the process:', error);
  } finally {
    console.log('Closing browser...');
    await browser.close();
  }
}
// --- Main Execution ---
// Fix: the promise returned by main() was floating. Although main() catches
// its own loop errors, its finally-block `browser.close()` can still reject,
// which would surface as an unhandled rejection. Attach a terminal catch.
main().catch((err) => {
  console.error('Fatal error:', err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment