Created
January 22, 2026 08:14
-
-
Save favstats/b9e7f9856687b808a303a27dfb927246 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # worker.R - Retrieve YT Metadata + Subtitle in R | |
| # --- 1. LOAD LIBRARIES --- | |
| library(readr) | |
| library(dplyr) | |
| library(jsonlite) | |
| library(stringr) | |
| library(httr) | |
| # --- 2. HELPERS --- | |
| `%||%` <- function(a, b) if (is.null(a) || length(a) == 0) b else a | |
| force_all_cols_to_char <- function(df) { | |
| df %>% dplyr::mutate(across(everything(), as.character)) | |
| } | |
| # Robust YouTube ID extractor (watch, youtu.be, shorts, embed) | |
| extract_youtube_id <- function(x) { | |
| if (is.na(x) || !nzchar(x)) return(NA_character_) | |
| x <- trimws(x) | |
| m <- stringr::str_match( | |
| x, | |
| "(?i)(?:youtu\\.be/|youtube\\.com/(?:watch\\?v=|shorts/|embed/))([A-Za-z0-9_-]{11})" | |
| ) | |
| m[, 2] | |
| } | |
| # Cross-platform command runner that writes stdout to a file | |
| run_cmd_to_file <- function(bin, args, outfile, verbose = TRUE) { | |
| dir.create(dirname(outfile), recursive = TRUE, showWarnings = FALSE) | |
| if (file.exists(outfile)) file.remove(outfile) | |
| # system2 works on Mac/Windows/Linux | |
| # stdout/stderr can be file paths | |
| status <- suppressWarnings( | |
| system2( | |
| command = bin, | |
| args = args, | |
| stdout = outfile, | |
| stderr = if (verbose) "" else TRUE | |
| ) | |
| ) | |
| # On success, status is 0; otherwise non-zero | |
| invisible(status) | |
| } | |
| save_data_efficiently <- function(d, path, remove_duplicates = FALSE, ignore_columns = NULL) { | |
| if (!file.exists(path)) { | |
| dir_path <- dirname(path) | |
| if (!dir.exists(dir_path)) dir.create(dir_path, recursive = TRUE) | |
| readr::write_csv(force_all_cols_to_char(d), path) | |
| return(invisible()) | |
| } | |
| existing_cols <- names(readr::read_csv(path, n_max = 0, show_col_types = FALSE)) | |
| new_cols <- names(d) | |
| is_schema_changing <- !all(new_cols %in% existing_cols) | |
| d <- force_all_cols_to_char(d) | |
| if (!is_schema_changing && !remove_duplicates) { | |
| d_conformed <- d[, existing_cols] | |
| readr::write_csv(d_conformed, path, append = TRUE) | |
| } else { | |
| old_data <- readr::read_csv(path, show_col_types = FALSE, col_types = cols(.default = "c")) | |
| combined_data <- dplyr::bind_rows(old_data, d) | |
| if (remove_duplicates) { | |
| keep_columns <- if (!is.null(ignore_columns)) setdiff(names(combined_data), ignore_columns) else names(combined_data) | |
| combined_data <- combined_data %>% dplyr::distinct(across(all_of(keep_columns)), .keep_all = TRUE) | |
| } | |
| readr::write_csv(combined_data, path) | |
| } | |
| invisible() | |
| } | |
| # Ensure transcript and JSON dirs exist | |
| if (!dir.exists("transcripts")) dir.create("transcripts") | |
| if (!dir.exists("yt_json")) dir.create("yt_json") | |
| # --- 3. CONFIG (edit this section) --- | |
| # Keep everything here explicit and simple. | |
| # | |
| # 1) Where yt-dlp and ffmpeg live | |
| # On macOS (Homebrew), this usually just works as-is. | |
| YTDLP_BIN <- "yt-dlp" # e.g., "/opt/homebrew/bin/yt-dlp" if needed | |
| FFMPEG_DIR <- "" # e.g., "/opt/homebrew/bin" or leave empty to use PATH | |
| # 2) Input + output | |
| URL_COLUMN <- "video_url" # column name with YouTube URLs | |
| OUTPUT_CSV <- "metadata.csv" # where metadata will be written | |
| # 3) Transcript language | |
| TRANSCRIPT_LANG <- "en" | |
| # --- 4. VIDEO PROCESSING --- | |
| process_and_save_video <- function(video_url, output_csv, transcript_lang = "en", verbose = TRUE) { | |
| ytid <- extract_youtube_id(video_url) | |
| if (is.na(ytid)) { | |
| if (verbose) cat("⚠️ Not a recognized YouTube URL (skipping):", video_url, "\n") | |
| return(invisible()) | |
| } | |
| if (verbose) cat("▶️ Processing:", video_url, "\n") | |
| json_path <- file.path("yt_json", paste0(ytid, ".json")) | |
| # Build yt-dlp args (no shell redirection; stdout handled by system2) | |
| args <- c( | |
| if (nzchar(FFMPEG_DIR)) c("--ffmpeg-location", FFMPEG_DIR) else character(), | |
| "--write-auto-subs", | |
| "--sub-format", "vtt", | |
| "--sub-lang", transcript_lang, | |
| "--print-json", | |
| "--skip-download", | |
| video_url | |
| ) | |
| if (verbose) { | |
| cat("📦 Running yt-dlp:\n") | |
| cat(" bin:", YTDLP_BIN, "\n") | |
| cat(" args:", paste(args, collapse = " "), "\n") | |
| } | |
| status <- run_cmd_to_file(bin = YTDLP_BIN, args = args, outfile = json_path, verbose = verbose) | |
| if (!file.exists(json_path) || file.size(json_path) == 0) { | |
| if (verbose) cat("❌ yt-dlp output file not created (exit code:", status, ")\n\n") | |
| return(invisible()) | |
| } | |
| json_raw <- readLines(json_path, warn = FALSE) | |
| json_string <- paste(json_raw, collapse = "\n") | |
| if (!jsonlite::validate(json_string)) { | |
| if (verbose) cat("⚠️ Malformed JSON. Skipping.\n\n") | |
| return(invisible()) | |
| } | |
| video_meta <- jsonlite::fromJSON(json_string, flatten = TRUE) | |
| # --- Transcript Handling --- | |
| if (!is.null(video_meta$requested_subtitles[[transcript_lang]])) { | |
| transcript_url <- video_meta$requested_subtitles[[transcript_lang]]$url | |
| if (verbose) cat("📄 Transcript URL found, downloading...\n") | |
| tryCatch({ | |
| resp <- httr::GET(transcript_url) | |
| lines <- httr::content(resp, as = "text", encoding = "UTF-8") | |
| lines <- strsplit(lines, "\n")[[1]] | |
| readr::write_lines(lines, file.path("transcripts", paste0(ytid, ".vtt"))) | |
| if (verbose) cat("✅ Transcript saved for:", ytid, "\n") | |
| }, error = function(e) { | |
| if (verbose) cat("❌ Transcript download failed:", conditionMessage(e), "\n") | |
| }) | |
| } else { | |
| if (verbose) cat("📭 No transcript found for:", ytid, "\n") | |
| } | |
| # --- Metadata Construction (FULL schema as in your original) --- | |
| video_df <- data.frame( | |
| id = video_meta$id %||% NA, | |
| title = video_meta$title %||% NA, | |
| fulltitle = video_meta$fulltitle %||% NA, | |
| ext = video_meta$ext %||% NA, | |
| alt_title = video_meta$alt_title %||% NA, | |
| description = video_meta$description %||% NA, | |
| display_id = video_meta$display_id %||% NA, | |
| uploader = video_meta$uploader %||% NA, | |
| uploader_id = video_meta$uploader_id %||% NA, | |
| uploader_url = video_meta$uploader_url %||% NA, | |
| license = video_meta$license %||% NA, | |
| creators = paste(video_meta$creators, collapse = "; ") %||% NA, | |
| creator = video_meta$creator %||% NA, | |
| timestamp = video_meta$timestamp %||% NA, | |
| upload_date = video_meta$upload_date %||% NA, | |
| release_timestamp = video_meta$release_timestamp %||% NA, | |
| release_date = video_meta$release_date %||% NA, | |
| release_year = video_meta$release_year %||% NA, | |
| modified_timestamp = video_meta$modified_timestamp %||% NA, | |
| modified_date = video_meta$modified_date %||% NA, | |
| channel = video_meta$channel %||% NA, | |
| channel_id = video_meta$channel_id %||% NA, | |
| channel_url = video_meta$channel_url %||% NA, | |
| channel_follower_count = video_meta$channel_follower_count %||% NA, | |
| channel_is_verified = video_meta$channel_is_verified %||% NA, | |
| location = video_meta$location %||% NA, | |
| duration = video_meta$duration %||% NA, | |
| duration_string = video_meta$duration_string %||% NA, | |
| view_count = video_meta$view_count %||% NA, | |
| concurrent_view_count = video_meta$concurrent_view_count %||% NA, | |
| like_count = video_meta$like_count %||% NA, | |
| dislike_count = video_meta$dislike_count %||% NA, | |
| repost_count = video_meta$repost_count %||% NA, | |
| average_rating = video_meta$average_rating %||% NA, | |
| comment_count = video_meta$comment_count %||% NA, | |
| age_limit = video_meta$age_limit %||% NA, | |
| live_status = video_meta$live_status %||% NA, | |
| is_live = video_meta$is_live %||% NA, | |
| was_live = video_meta$was_live %||% NA, | |
| playable_in_embed = video_meta$playable_in_embed %||% NA, | |
| availability = video_meta$availability %||% NA, | |
| media_type = video_meta$media_type %||% NA, | |
| start_time = video_meta$start_time %||% NA, | |
| end_time = video_meta$end_time %||% NA, | |
| extractor = video_meta$extractor %||% NA, | |
| extractor_key = video_meta$extractor_key %||% NA, | |
| epoch = video_meta$epoch %||% NA, | |
| playlist_id = video_meta$playlist_id %||% NA, | |
| playlist_title = video_meta$playlist_title %||% NA, | |
| playlist = video_meta$playlist %||% NA, | |
| playlist_count = video_meta$playlist_count %||% NA, | |
| playlist_index = video_meta$playlist_index %||% NA, | |
| playlist_uploader = video_meta$playlist_uploader %||% NA, | |
| playlist_uploader_id = video_meta$playlist_uploader_id %||% NA, | |
| playlist_channel = video_meta$playlist_channel %||% NA, | |
| playlist_channel_id = video_meta$playlist_channel_id %||% NA, | |
| playlist_webpage_url = video_meta$playlist_webpage_url %||% NA, | |
| webpage_url = video_meta$webpage_url %||% NA, | |
| webpage_url_basename = video_meta$webpage_url_basename %||% NA, | |
| webpage_url_domain = video_meta$webpage_url_domain %||% NA, | |
| original_url = video_meta$original_url %||% NA, | |
| categories = paste(video_meta$categories, collapse = "; ") %||% NA, | |
| tags = paste(video_meta$tags, collapse = "; ") %||% NA, | |
| cast = paste(video_meta$cast, collapse = "; ") %||% NA | |
| ) | |
| # keep the raw input url + ytid for easier debugging/joins | |
| video_df$input_url <- video_url | |
| video_df$ytid <- ytid | |
| save_data_efficiently(d = video_df, path = output_csv) | |
| if (verbose) cat("📅 Metadata saved for:", ytid, "\n\n") | |
| invisible() | |
| } | |
| # --- 5. MAIN EXECUTION --- | |
| data_with_urls <- tibble(video_url = "https://www.youtube.com/watch?v=BYOeamkqHtc") | |
| data_with_urls <- force_all_cols_to_char(data_with_urls) | |
| if (!URL_COLUMN %in% names(data_with_urls)) { | |
| stop("Column not found: ", URL_COLUMN, call. = FALSE) | |
| } | |
| urls <- unique(na.omit(data_with_urls[[URL_COLUMN]])) | |
| cat("URL column: ", URL_COLUMN, "\n", sep = "") | |
| cat("Output CSV: ", OUTPUT_CSV, "\n", sep = "") | |
| cat("Transcript language: ", TRANSCRIPT_LANG, "\n", sep = "") | |
| cat("Videos to process: ", length(urls), "\n\n", sep = "") | |
| # Run | |
| for (u in urls) { | |
| process_and_save_video( | |
| video_url = u, | |
| output_csv = OUTPUT_CSV, | |
| transcript_lang = TRANSCRIPT_LANG, | |
| verbose = TRUE | |
| ) | |
| } | |
| cat("Done.\n") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment