Skip to content

Instantly share code, notes, and snippets.

@Chrisbryan17
Forked from guillaume-rygn/Youtube transcript playlist
Last active November 1, 2024 07:27
Show Gist options
  • Select an option

  • Save Chrisbryan17/cb560633bd29ecfa449e51aee17511b1 to your computer and use it in GitHub Desktop.

Select an option

Save Chrisbryan17/cb560633bd29ecfa449e51aee17511b1 to your computer and use it in GitHub Desktop.
Youtube_transcript_playlist
import { YoutubeTranscript } from 'youtube-transcript';
import puppeteer from 'puppeteer';
import fs from 'fs/promises';
import readline from 'readline/promises';
import * as path from 'path';
/**
 * Upper bound on the combined length (in characters of the compact JSON
 * serialization) of the video entries grouped into a single output file.
 */
const MAX_CHARS_PER_FILE = 500_000;
/**
 * Interactive entry point.
 *
 * Prompts for a YouTube playlist URL, scrapes the playlist title and the
 * video links with Puppeteer, fetches the English transcript of every video,
 * groups the results into chunks of at most MAX_CHARS_PER_FILE characters and
 * writes each chunk as pretty-printed JSON to `<title>_partN.txt` in a
 * directory chosen by the user.
 *
 * Errors are logged rather than thrown: a transcript that cannot be fetched
 * is recorded with a placeholder text, a file that cannot be written is
 * reported and skipped, and any other failure is logged by the outer catch.
 *
 * @returns {Promise<void>} Resolves once all files have been written (or the
 *   run was aborted after logging an error).
 */
async function main() {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });
  try {
    const playlistUrl = await rl.question('Enter the URL of the YouTube playlist: ');

    let playlistTitle;
    let videos;
    const browser = await puppeteer.launch({ args: ['--no-sandbox'] });
    try {
      const page = await browser.newPage();
      await page.goto(playlistUrl, { waitUntil: 'networkidle2' });
      try {
        await page.waitForSelector('ytd-playlist-video-renderer, #title h1 yt-formatted-string', { timeout: 60000 });
      } catch (error) {
        console.error('Error waiting for playlist items or title. Invalid playlist?', error);
        return;
      }

      // Runs in the page context. Trim BEFORE collapsing whitespace so that
      // surrounding whitespace does not turn into leading/trailing underscores.
      playlistTitle = await page.evaluate(() => {
        const titleElement = document.querySelector('#title h1 yt-formatted-string');
        const rawTitle = titleElement ? titleElement.textContent.trim() : '';
        return rawTitle ? rawTitle.replace(/\s+/g, '_') : 'Untitled_Playlist';
      });

      // NOTE(review): only the playlist items present in the DOM at this point
      // are collected; if the page loads further items on scroll, long
      // playlists may be truncated — confirm.
      videos = await page.evaluate(() => {
        const videoData = [];
        const playlistItems = document.querySelectorAll('ytd-playlist-video-renderer');
        playlistItems.forEach((item) => {
          const link = item.querySelector('a#video-title');
          if (link) {
            videoData.push({ url: link.href, title: link.textContent.trim() });
          }
        });
        return videoData;
      });
    } finally {
      // Always release the browser, including on the early return above and
      // when navigation or evaluation throws.
      await browser.close();
    }

    if (videos.length === 0) {
      console.warn('No videos found in the playlist; nothing to write.');
      return;
    }

    // Sanitize playlistTitle for filenames
    let safeTitle = playlistTitle.replace(/[/\\?%*:|"<>]/g, '-');
    if (!safeTitle || !safeTitle.trim()) {
      safeTitle = 'Untitled_Playlist';
    }

    const finalDoc = [];
    let currentFileIndex = 1;
    let currentFileContent = [];
    let currentCharCount = 0;

    // Closes the chunk being built and starts a new one. Empty chunks are
    // skipped so that a single oversized entry never produces an empty file.
    const flushCurrentFile = () => {
      if (currentFileContent.length === 0) {
        return;
      }
      finalDoc.push({
        filename: `${safeTitle}_part${currentFileIndex}.txt`,
        data: currentFileContent,
      });
      currentFileIndex++;
      currentFileContent = [];
      currentCharCount = 0;
    };

    // Adds an entry to the current chunk, starting a new chunk first when the
    // entry would push it over the limit. Sizes are measured on the compact
    // JSON form; the files are written indented, so they can be somewhat
    // larger than MAX_CHARS_PER_FILE on disk.
    const appendEntry = (entry) => {
      const entryLength = JSON.stringify(entry).length;
      if (currentCharCount + entryLength > MAX_CHARS_PER_FILE) {
        flushCurrentFile();
      }
      currentFileContent.push(entry);
      currentCharCount += entryLength;
    };

    // Transcripts are fetched one at a time, in playlist order.
    for (const { url, title } of videos) {
      let transcription;
      try {
        const transcript = await YoutubeTranscript.fetchTranscript(url, {
          lang: 'en',
          enableAutoGeneratedCaptions: true,
        });
        transcription = transcript.map((item) => item.text).join(' ');
        console.log(`Transcription fetched for ${title}`);
      } catch (error) {
        // Keep the video in the output with a placeholder so that a single
        // failure does not abort the whole run.
        console.error(`Error fetching transcript for ${url}:`, error);
        transcription = 'Error fetching transcript';
      }
      appendEntry({ url, title, transcription });
    }
    // Save the last file
    flushCurrentFile();

    const outputDir = await rl.question('Enter output directory (leave blank for current directory): ');
    const safeOutputDir = outputDir.trim() || '.';
    for (const fileData of finalDoc) {
      const filePath = path.join(safeOutputDir, fileData.filename);
      const data = JSON.stringify(fileData.data, null, 2);
      try {
        await fs.mkdir(path.dirname(filePath), { recursive: true });
        await fs.writeFile(filePath, data);
        console.log(`File ${filePath} created successfully`);
      } catch (err) {
        // A failed write is reported and the remaining files are still attempted.
        console.error(`Error writing file ${filePath}:`, err);
      }
    }
  } catch (error) {
    console.error('An unexpected error occurred:', error);
  } finally {
    rl.close();
  }
}
// Script entry point. main() logs the errors raised inside its own try block;
// the handler below covers anything that escapes it, so the returned promise
// is never left floating and the process exits with a failure code.
main().catch((error) => {
  console.error('Fatal error:', error);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment