Created
February 17, 2026 05:22
-
-
Save kjunichi/72c28e0eb64c5e6330af53604e666003 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| const puppeteer = require('puppeteer'); | |
| const fs = require('fs/promises'); | |
| const path = require('path'); | |
| /** | |
| * Sanitizes a string to be used as a valid filename. | |
| * @param {string} title The string to sanitize. | |
| * @returns {string} A sanitized string safe for use as a filename. | |
| */ | |
| function sanitizeFilename(title) { | |
| // Replace invalid characters with an underscore | |
| const invalidChars = /[\/:"*?<>|]/g; | |
| let sanitized = title.replace(invalidChars, '_'); | |
| // Limit length to avoid issues with file systems | |
| return sanitized.substring(0, 200); | |
| } | |
/**
 * Processes a single note page, saves its content, and finds the next page link.
 * Side effects: writes "<title>.txt" to the current working directory.
 * @param {import('puppeteer').Browser} browser The Puppeteer browser instance.
 * @param {string} url The URL of the page to process.
 * @returns {Promise<{nextUrl: string|null, pageTitle: string}>} The URL of the next article and the title of the current page.
 */
async function processPage(browser, url) {
  const page = await browser.newPage();
  let pageTitle = '';
  try {
    console.log(`Navigating to ${url}...`);
    await page.goto(url, { waitUntil: 'networkidle2' });

    console.log('Extracting main page title and content...');
    const rawPageTitle = await page.title();
    // Strip the trailing "| kjunichi" / "｜kjunichi" site suffix from the title.
    // Fix: the original pattern /\s*|kjunichi/g had an unescaped "|", turning it
    // into an alternation whose empty-matching "\s*" branch removed ALL
    // whitespace from the title and never actually removed the suffix.
    pageTitle = rawPageTitle.replace(/\s*[|｜]\s*kjunichi/g, '').trim();
    const mainContent = await page.evaluate(() => document.body.innerText);

    // Derive the output filename from the title; fall back to the note ID
    // from the URL path, then to a timestamp, if the title sanitizes to ''.
    let sanitizedTitle = sanitizeFilename(pageTitle);
    if (!sanitizedTitle) {
      console.warn('Page title was empty after sanitization. Using note ID as filename.');
      try {
        const urlObject = new URL(url);
        const pathParts = urlObject.pathname.split('/');
        const noteId = pathParts[pathParts.length - 1];
        sanitizedTitle = sanitizeFilename(noteId);
      } catch (e) {
        sanitizedTitle = `untitled-${Date.now()}`;
      }
    }
    const outputFilename = `${sanitizedTitle}.txt`;
    console.log(`Output will be saved to: ${outputFilename}`);

    // Collect links to the author's own tweets embedded in the article.
    console.log('Searching for Twitter links...');
    const twitterUrls = await page.$$eval('a', (anchors) =>
      anchors
        .map((a) => a.href)
        .filter((href) => href.includes('twitter.com/kjunichi/status/'))
    );
    console.log(`Found ${twitterUrls.length} Twitter link(s).`);

    const allContent = [
      `--- Page Content from: ${url} ---\n`,
      mainContent,
      `\n--- End of Page Content ---\n\n`
    ];

    // Visit each unique tweet URL in its own tab; a failure on one tweet is
    // recorded in the output but does not abort the page as a whole.
    for (const twitterUrl of new Set(twitterUrls)) {
      let twitterPage;
      try {
        console.log(`Scraping content from: ${twitterUrl}`);
        twitterPage = await browser.newPage();
        await twitterPage.goto(twitterUrl, { waitUntil: 'networkidle2' });
        const twitterContent = await twitterPage.evaluate(() => document.body.innerText);
        // Drop everything up to and including the author header so only the
        // tweet body (and below) remains; keep the full text if not found.
        const marker = 'kjunichi @kjunichi';
        const markerIndex = twitterContent.indexOf(marker);
        const cleanedContent = markerIndex !== -1
          ? twitterContent.substring(markerIndex + marker.length).trim()
          : twitterContent;
        allContent.push(
          `--- Content from Twitter URL: ${twitterUrl} ---\n`,
          cleanedContent,
          `\n--- End of Twitter Content ---\n\n`
        );
      } catch (err) {
        console.error(`Failed to scrape ${twitterUrl}: ${err.message}`);
        allContent.push(
          `--- FAILED to get content from: ${twitterUrl} ---\n\n`
        );
      } finally {
        if (twitterPage) await twitterPage.close();
      }
    }

    console.log('Combining all content and saving to file...');
    await fs.writeFile(outputFilename, allContent.join(`\n`));
    console.log(`Successfully saved all content to ${path.resolve(outputFilename)}`);

    // The next-article widget is lazy-loaded; scroll to the bottom to make it render.
    console.log('Scrolling to the bottom of the page to trigger lazy-loaded elements...');
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    console.log('Waiting for potential next-link elements to appear...');
    try {
      await page.waitForSelector('a.o-prevNext__button--next, h2', { timeout: 5000 });
      console.log('A potential next-link element was found.');
    } catch (e) {
      console.log('Did not find any potential next-link elements within the timeout.');
    }

    // Resolve the next article URL using three strategies, most specific first.
    const nextUrl = await page.evaluate(() => {
      // Strategy 1: Look for the specific "o-prevNext" component
      const prevNextLink = document.querySelector('a.o-prevNext__button--next');
      if (prevNextLink && prevNextLink.href) {
        return prevNextLink.href;
      }

      // Strategy 2: Find a heading containing "翌日" and check its sibling
      const h2s = Array.from(document.querySelectorAll('h2'));
      const yokujitsuHeader = h2s.find(h2 => h2.innerText.includes('翌日'));
      if (yokujitsuHeader) {
        let sibling = yokujitsuHeader.nextElementSibling;
        if (sibling) {
          // Case A: The sibling itself is the link
          if (sibling.tagName === 'A' && sibling.href) {
            return sibling.href;
          }
          // Case B: The link is inside the sibling (e.g., in a <figure>);
          // embedded-note iframes carry the note ID in their src path.
          const iframe = sibling.querySelector('iframe');
          if (iframe) {
            let sourceUrl = iframe.src;
            // If src is empty, try data-src (lazy-loaded iframes)
            if (!sourceUrl && iframe.dataset.src) {
              sourceUrl = iframe.dataset.src;
            }
            if (sourceUrl) {
              try {
                const srcUrl = new URL(sourceUrl);
                const pathParts = srcUrl.pathname.split('/');
                const noteId = pathParts[pathParts.length - 1];
                if (noteId) {
                  return `https://note.com/kjunichi/n/${noteId}`;
                }
              } catch (e) { /* Ignore malformed embed URLs */ }
            }
          }
          const directLink = sibling.querySelector('a');
          if (directLink && directLink.href) {
            return directLink.href;
          }
        }
      }

      // Strategy 3: Fallback to find any link with "次の記事へ" text
      const allLinks = Array.from(document.querySelectorAll('a'));
      const nextArticleLink = allLinks.find(a => a.innerText.includes('次の記事へ'));
      if (nextArticleLink && nextArticleLink.href) {
        return nextArticleLink.href;
      }
      return null;
    });

    return { nextUrl, pageTitle };
  } finally {
    // Always release the tab, even on navigation/scrape errors.
    if (page) await page.close();
  }
}
/**
 * Main function to orchestrate the scraping loop.
 * Follows the chain of next-article links page by page, stopping once the
 * December 31st entry has been processed or no further link is found.
 */
async function main() {
  console.log('Launching browser...');
  const browser = await puppeteer.launch({ headless: true });
  let currentUrl = 'https://note.com/'; // starting URL for the crawl
  try {
    while (currentUrl) {
      const result = await processPage(browser, currentUrl);

      // Terminal page of the year: process it, then stop walking.
      if (result.pageTitle && result.pageTitle.includes('12月31日')) {
        console.log('Reached December 31st. Stopping after this page.');
        currentUrl = null;
        continue;
      }

      if (result.nextUrl) {
        console.log(`Next page found: ${result.nextUrl}`);
        currentUrl = result.nextUrl;
      } else {
        console.log('No more pages to process.');
        currentUrl = null;
      }
    }
  } catch (error) {
    console.error('An unexpected error occurred during the process:', error);
  } finally {
    console.log('Closing browser...');
    await browser.close();
  }
}
// --- Main Execution ---
// Fix: the promise returned by main() was floating. Although main() catches
// its own loop errors, its finally-block `browser.close()` can still reject,
// which would surface as an unhandled rejection. Attach a terminal catch.
main().catch((err) => {
  console.error('Fatal error:', err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment