Forked from guillaume-rygn/Youtube transcript playlist
Last active
November 1, 2024 07:27
-
-
Save Chrisbryan17/cb560633bd29ecfa449e51aee17511b1 to your computer and use it in GitHub Desktop.
Youtube_transcript_playlist
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import { YoutubeTranscript } from 'youtube-transcript'; | |
| import puppeteer from 'puppeteer'; | |
| import fs from 'fs/promises'; | |
| import readline from 'readline/promises'; | |
| import * as path from 'path'; | |
/**
 * Upper bound on the summed compact-JSON length (`JSON.stringify(entry).length`)
 * of the entries grouped into one output file. Files are written pretty-printed,
 * so the bytes on disk are somewhat larger than this figure.
 */
const MAX_CHARS_PER_FILE = 500000;

/**
 * Turns a raw playlist title into a string that is safe to use in a filename.
 *
 * Surrounding whitespace is dropped first, inner whitespace runs become `_`,
 * and characters that are illegal in common filesystems become `-`.
 *
 * @param {string | null | undefined} rawTitle - Title text as read from the page.
 * @returns {string} Sanitized title, or `'Untitled_Playlist'` when nothing usable remains.
 */
function toSafeTitle(rawTitle) {
  // Trim BEFORE collapsing whitespace; otherwise padding turns into stray underscores.
  const collapsed = (rawTitle ?? '').trim().replace(/\s+/g, '_');
  const safe = collapsed.replace(/[/\\?%*:|"<>]/g, '-');
  return safe || 'Untitled_Playlist';
}

/**
 * Splits transcript entries into output files so that the summed compact-JSON
 * length of each file's entries stays within `maxChars` where possible.
 * An entry that is larger than `maxChars` on its own gets a file to itself.
 *
 * @param {Array<{url: string, title: string, transcription: string}>} entries
 * @param {string} safeTitle - Filename prefix (already sanitized).
 * @param {number} [maxChars=MAX_CHARS_PER_FILE] - Size budget per file.
 * @returns {Array<{filename: string, data: Array<object>}>} Files in playlist order.
 */
function groupIntoFiles(entries, safeTitle, maxChars = MAX_CHARS_PER_FILE) {
  const files = [];
  let batch = [];
  let batchChars = 0;

  const flush = () => {
    // Never emit an empty file (happens when the very first entry is oversized).
    if (batch.length === 0) {
      return;
    }
    files.push({
      filename: `${safeTitle}_part${files.length + 1}.txt`,
      data: batch,
    });
    batch = [];
    batchChars = 0;
  };

  for (const entry of entries) {
    const entryChars = JSON.stringify(entry).length;
    if (batchChars + entryChars > maxChars) {
      flush();
    }
    batch.push(entry);
    batchChars += entryChars;
  }
  flush();

  return files;
}

/**
 * Opens the playlist page in a headless browser and reads its title and the
 * video links currently present in the DOM.
 *
 * The browser is always closed, including when navigation or scraping fails.
 *
 * @param {string} playlistUrl - URL of the playlist page.
 * @returns {Promise<{rawTitle: string | null, videos: Array<{url: string, title: string}>} | null>}
 *   Scraped data, or `null` when the expected playlist elements never appeared.
 */
async function scrapePlaylist(playlistUrl) {
  const browser = await puppeteer.launch({ args: ['--no-sandbox'] });
  try {
    const page = await browser.newPage();
    await page.goto(playlistUrl, { waitUntil: 'networkidle2' });

    try {
      await page.waitForSelector(
        'ytd-playlist-video-renderer, #title h1 yt-formatted-string',
        { timeout: 60000 },
      );
    } catch (error) {
      console.error('Error waiting for playlist items or title. Invalid playlist?', error);
      return null;
    }

    const rawTitle = await page.evaluate(() => {
      const titleElement = document.querySelector('#title h1 yt-formatted-string');
      return titleElement ? titleElement.textContent : null;
    });

    // NOTE(review): only rows already rendered are collected; if the page loads
    // further rows on scroll, long playlists may be truncated - confirm.
    const videos = await page.evaluate(() => {
      const videoData = [];
      document.querySelectorAll('ytd-playlist-video-renderer').forEach((item) => {
        const link = item.querySelector('a#video-title');
        if (link) {
          videoData.push({ url: link.href, title: link.textContent.trim() });
        }
      });
      return videoData;
    });

    return { rawTitle, videos };
  } finally {
    // Without this a failed navigation leaves the browser process running.
    await browser.close();
  }
}

/**
 * Fetches the transcript of one video and flattens it to a single string.
 * Failures are logged and reported through a placeholder entry instead of throwing,
 * so one broken video does not abort the whole playlist.
 *
 * @param {{url: string, title: string}} video
 * @returns {Promise<{url: string, title: string, transcription: string}>}
 */
async function fetchVideoEntry({ url, title }) {
  try {
    const transcript = await YoutubeTranscript.fetchTranscript(url, {
      lang: 'en',
      enableAutoGeneratedCaptions: true,
    });
    const transcription = transcript.map((item) => item.text).join(' ');
    console.log(`Transcription fetched for ${title}`);
    return { url, title, transcription };
  } catch (error) {
    console.error(`Error fetching transcript for ${url}:`, error);
    return { url, title, transcription: 'Error fetching transcript' };
  }
}

/**
 * Writes every grouped file as pretty-printed JSON into `outputDir`,
 * creating the directory when needed. A failure on one file is logged and
 * does not prevent the remaining files from being written.
 *
 * @param {Array<{filename: string, data: Array<object>}>} files
 * @param {string} outputDir - Target directory.
 * @returns {Promise<void>}
 */
async function writeOutputFiles(files, outputDir) {
  for (const { filename, data } of files) {
    const filePath = path.join(outputDir, filename);
    try {
      await fs.mkdir(path.dirname(filePath), { recursive: true });
      await fs.writeFile(filePath, JSON.stringify(data, null, 2));
      console.log(`File ${filePath} created successfully`);
    } catch (err) {
      console.error(`Error writing file ${filePath}:`, err);
    }
  }
}

/**
 * Interactive entry point: asks for a playlist URL, scrapes the playlist,
 * fetches each video's transcript, groups the results into size-bounded
 * files and writes them to a directory chosen by the user.
 *
 * Unexpected errors are logged rather than rethrown; the readline interface
 * is always closed.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });

  try {
    const playlistUrl = (await rl.question('Enter the URL of the YouTube playlist: ')).trim();
    if (!playlistUrl) {
      console.error('No playlist URL provided.');
      return;
    }

    const playlist = await scrapePlaylist(playlistUrl);
    if (playlist === null) {
      return;
    }
    const safeTitle = toSafeTitle(playlist.rawTitle);

    // Fetched one at a time on purpose, matching the original request pattern.
    const entries = [];
    for (const video of playlist.videos) {
      entries.push(await fetchVideoEntry(video));
    }
    const files = groupIntoFiles(entries, safeTitle);

    const outputDir = await rl.question('Enter output directory (leave blank for current directory): ');
    // `||` is intended: an empty answer must fall back to the current directory.
    await writeOutputFiles(files, outputDir.trim() || '.');
  } catch (error) {
    console.error('An unexpected error occurred:', error);
  } finally {
    rl.close();
  }
}
// Script entry point. `main` handles its own expected failures; this handler
// only covers a rejection that escapes it, and reports it via the exit code.
main().catch((error) => {
  console.error('Fatal error:', error);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment