# worker.R - Retrieve YouTube metadata + subtitles in R
# --- 1. LOAD LIBRARIES ---
library(readr)
library(dplyr)
library(jsonlite)
library(stringr)
library(httr)
# --- 2. HELPERS ---
`%||%` <- function(a, b) if (is.null(a) || length(a) == 0) b else a
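# e.g. NULL %||% NA returns NA; character(0) %||% NA returns NA; "x" %||% NA returns "x"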
force_all_cols_to_char <- function(df) {
  df %>% dplyr::mutate(across(everything(), as.character))
}
# Collapse a vector to "a; b; c", or NA when empty/NULL.
# (Needed because paste() never returns NULL, so `paste(...) %||% NA`
# silently yields "" for missing list fields.)
collapse_or_na <- function(x) {
  if (is.null(x) || length(x) == 0) return(NA_character_)
  paste(x, collapse = "; ")
}
# Robust YouTube ID extractor (watch, youtu.be, shorts, embed)
extract_youtube_id <- function(x) {
  if (is.na(x) || !nzchar(x)) return(NA_character_)
  x <- trimws(x)
  m <- stringr::str_match(
    x,
    "(?i)(?:youtu\\.be/|youtube\\.com/(?:watch\\?v=|shorts/|embed/))([A-Za-z0-9_-]{11})"
  )
  m[, 2]
}
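# Examples (the ID below is illustrative but well-formed):
#   extract_youtube_id("https://youtu.be/dQw4w9WgXcQ")                # "dQw4w9WgXcQ"
#   extract_youtube_id("https://www.youtube.com/shorts/dQw4w9WgXcQ")  # "dQw4w9WgXcQ"
#   extract_youtube_id("https://example.com/not-youtube")             # NA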
# Cross-platform command runner that writes stdout to a file
run_cmd_to_file <- function(bin, args, outfile, verbose = TRUE) {
  dir.create(dirname(outfile), recursive = TRUE, showWarnings = FALSE)
  if (file.exists(outfile)) file.remove(outfile)
  # system2 works on macOS/Windows/Linux; stdout/stderr can be file paths
  status <- suppressWarnings(
    system2(
      command = bin,
      args = args,
      stdout = outfile,
      # "" shows stderr on the console; FALSE discards it. Avoid stderr = TRUE
      # here: with stdout pointing at a file it can interleave stderr into the
      # JSON output and corrupt it.
      stderr = if (verbose) "" else FALSE
    )
  )
  # On success, status is 0; otherwise non-zero
  invisible(status)
}
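# Example usage (commented out so sourcing this file stays side-effect free;
# --version is a standard yt-dlp flag that just prints the version string):
# run_cmd_to_file("yt-dlp", "--version", file.path(tempdir(), "ytdlp_version.txt"))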
save_data_efficiently <- function(d, path, remove_duplicates = FALSE, ignore_columns = NULL) {
  if (!file.exists(path)) {
    dir_path <- dirname(path)
    if (!dir.exists(dir_path)) dir.create(dir_path, recursive = TRUE)
    readr::write_csv(force_all_cols_to_char(d), path)
    return(invisible())
  }
  existing_cols <- names(readr::read_csv(path, n_max = 0, show_col_types = FALSE))
  new_cols <- names(d)
  # The schema changes if the column sets differ in either direction; checking
  # only new %in% existing would break the fast-append below whenever `d`
  # lacks a column the file already has.
  is_schema_changing <- !setequal(new_cols, existing_cols)
  d <- force_all_cols_to_char(d)
  if (!is_schema_changing && !remove_duplicates) {
    d_conformed <- d[, existing_cols]
    readr::write_csv(d_conformed, path, append = TRUE)
  } else {
    old_data <- readr::read_csv(path, show_col_types = FALSE, col_types = cols(.default = "c"))
    combined_data <- dplyr::bind_rows(old_data, d)
    if (remove_duplicates) {
      keep_columns <- if (!is.null(ignore_columns)) setdiff(names(combined_data), ignore_columns) else names(combined_data)
      combined_data <- combined_data %>% dplyr::distinct(across(all_of(keep_columns)), .keep_all = TRUE)
    }
    readr::write_csv(combined_data, path)
  }
  invisible()
}
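# Behavior sketch (commented out; the path is illustrative):
# save_data_efficiently(data.frame(a = 1, b = 2), "demo.csv")  # creates the file
# save_data_efficiently(data.frame(a = 3, b = 4), "demo.csv")  # same schema -> fast append
# save_data_efficiently(data.frame(a = 3, c = 5), "demo.csv")  # schema change -> full rewrite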
# Ensure transcript and JSON dirs exist
if (!dir.exists("transcripts")) dir.create("transcripts")
if (!dir.exists("yt_json")) dir.create("yt_json")
# --- 3. CONFIG (edit this section) ---
# Keep everything here explicit and simple.
#
# 1) Where yt-dlp and ffmpeg live
# On macOS (Homebrew), this usually just works as-is.
YTDLP_BIN <- "yt-dlp" # e.g., "/opt/homebrew/bin/yt-dlp" if needed
FFMPEG_DIR <- "" # e.g., "/opt/homebrew/bin" or leave empty to use PATH
# 2) Input + output
URL_COLUMN <- "video_url" # column name with YouTube URLs
OUTPUT_CSV <- "metadata.csv" # where metadata will be written
# 3) Transcript language
TRANSCRIPT_LANG <- "en"
# --- 4. VIDEO PROCESSING ---
process_and_save_video <- function(video_url, output_csv, transcript_lang = "en", verbose = TRUE) {
  ytid <- extract_youtube_id(video_url)
  if (is.na(ytid)) {
    if (verbose) cat("⚠️ Not a recognized YouTube URL (skipping):", video_url, "\n")
    return(invisible())
  }
  if (verbose) cat("▶️ Processing:", video_url, "\n")
  json_path <- file.path("yt_json", paste0(ytid, ".json"))
  # Build yt-dlp args (no shell redirection; stdout handled by system2)
  args <- c(
    if (nzchar(FFMPEG_DIR)) c("--ffmpeg-location", FFMPEG_DIR) else character(),
    "--write-auto-subs",
    "--sub-format", "vtt",
    "--sub-lang", transcript_lang,
    "--print-json",
    "--skip-download",
    video_url
  )
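  # With the defaults above this amounts to roughly:
  #   yt-dlp --write-auto-subs --sub-format vtt --sub-lang en --print-json --skip-download <URL>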
  if (verbose) {
    cat("📦 Running yt-dlp:\n")
    cat(" bin:", YTDLP_BIN, "\n")
    cat(" args:", paste(args, collapse = " "), "\n")
  }
  status <- run_cmd_to_file(bin = YTDLP_BIN, args = args, outfile = json_path, verbose = verbose)
  if (!file.exists(json_path) || file.size(json_path) == 0) {
    if (verbose) cat("❌ yt-dlp output file not created (exit code:", status, ")\n\n")
    return(invisible())
  }
  json_raw <- readLines(json_path, warn = FALSE)
  json_string <- paste(json_raw, collapse = "\n")
  if (!jsonlite::validate(json_string)) {
    if (verbose) cat("⚠️ Malformed JSON. Skipping.\n\n")
    return(invisible())
  }
  video_meta <- jsonlite::fromJSON(json_string, flatten = TRUE)
  # --- Transcript Handling ---
  if (!is.null(video_meta$requested_subtitles[[transcript_lang]])) {
    transcript_url <- video_meta$requested_subtitles[[transcript_lang]]$url
    if (verbose) cat("📄 Transcript URL found, downloading...\n")
    tryCatch({
      resp <- httr::GET(transcript_url)
      httr::stop_for_status(resp) # route HTTP errors into the handler below
      lines <- httr::content(resp, as = "text", encoding = "UTF-8")
      lines <- strsplit(lines, "\n")[[1]]
      readr::write_lines(lines, file.path("transcripts", paste0(ytid, ".vtt")))
      if (verbose) cat("✅ Transcript saved for:", ytid, "\n")
    }, error = function(e) {
      if (verbose) cat("❌ Transcript download failed:", conditionMessage(e), "\n")
    })
  } else {
    if (verbose) cat("📭 No transcript found for:", ytid, "\n")
  }
  # --- Metadata Construction (full schema) ---
  video_df <- data.frame(
    id = video_meta$id %||% NA,
    title = video_meta$title %||% NA,
    fulltitle = video_meta$fulltitle %||% NA,
    ext = video_meta$ext %||% NA,
    alt_title = video_meta$alt_title %||% NA,
    description = video_meta$description %||% NA,
    display_id = video_meta$display_id %||% NA,
    uploader = video_meta$uploader %||% NA,
    uploader_id = video_meta$uploader_id %||% NA,
    uploader_url = video_meta$uploader_url %||% NA,
    license = video_meta$license %||% NA,
    creators = collapse_or_na(video_meta$creators),
    creator = video_meta$creator %||% NA,
    timestamp = video_meta$timestamp %||% NA,
    upload_date = video_meta$upload_date %||% NA,
    release_timestamp = video_meta$release_timestamp %||% NA,
    release_date = video_meta$release_date %||% NA,
    release_year = video_meta$release_year %||% NA,
    modified_timestamp = video_meta$modified_timestamp %||% NA,
    modified_date = video_meta$modified_date %||% NA,
    channel = video_meta$channel %||% NA,
    channel_id = video_meta$channel_id %||% NA,
    channel_url = video_meta$channel_url %||% NA,
    channel_follower_count = video_meta$channel_follower_count %||% NA,
    channel_is_verified = video_meta$channel_is_verified %||% NA,
    location = video_meta$location %||% NA,
    duration = video_meta$duration %||% NA,
    duration_string = video_meta$duration_string %||% NA,
    view_count = video_meta$view_count %||% NA,
    concurrent_view_count = video_meta$concurrent_view_count %||% NA,
    like_count = video_meta$like_count %||% NA,
    dislike_count = video_meta$dislike_count %||% NA,
    repost_count = video_meta$repost_count %||% NA,
    average_rating = video_meta$average_rating %||% NA,
    comment_count = video_meta$comment_count %||% NA,
    age_limit = video_meta$age_limit %||% NA,
    live_status = video_meta$live_status %||% NA,
    is_live = video_meta$is_live %||% NA,
    was_live = video_meta$was_live %||% NA,
    playable_in_embed = video_meta$playable_in_embed %||% NA,
    availability = video_meta$availability %||% NA,
    media_type = video_meta$media_type %||% NA,
    start_time = video_meta$start_time %||% NA,
    end_time = video_meta$end_time %||% NA,
    extractor = video_meta$extractor %||% NA,
    extractor_key = video_meta$extractor_key %||% NA,
    epoch = video_meta$epoch %||% NA,
    playlist_id = video_meta$playlist_id %||% NA,
    playlist_title = video_meta$playlist_title %||% NA,
    playlist = video_meta$playlist %||% NA,
    playlist_count = video_meta$playlist_count %||% NA,
    playlist_index = video_meta$playlist_index %||% NA,
    playlist_uploader = video_meta$playlist_uploader %||% NA,
    playlist_uploader_id = video_meta$playlist_uploader_id %||% NA,
    playlist_channel = video_meta$playlist_channel %||% NA,
    playlist_channel_id = video_meta$playlist_channel_id %||% NA,
    playlist_webpage_url = video_meta$playlist_webpage_url %||% NA,
    webpage_url = video_meta$webpage_url %||% NA,
    webpage_url_basename = video_meta$webpage_url_basename %||% NA,
    webpage_url_domain = video_meta$webpage_url_domain %||% NA,
    original_url = video_meta$original_url %||% NA,
    categories = collapse_or_na(video_meta$categories),
    tags = collapse_or_na(video_meta$tags),
    cast = collapse_or_na(video_meta$cast)
  )
  # keep the raw input url + ytid for easier debugging/joins
  video_df$input_url <- video_url
  video_df$ytid <- ytid
  save_data_efficiently(d = video_df, path = output_csv)
  if (verbose) cat("📅 Metadata saved for:", ytid, "\n\n")
  invisible()
}
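# Transcripts are saved as raw WebVTT. Below is a minimal sketch for flattening
# a saved .vtt into plain text; it assumes standard WebVTT (a "WEBVTT" header,
# cue-timing lines like "00:00:01.000 --> 00:00:03.000", optional inline <tags>).
# `vtt_to_text` is a new helper, not part of the original script, and the
# de-duplication is a crude fix for the rolled-up repeats in auto-captions.
vtt_to_text <- function(vtt_path) {
  lines <- readr::read_lines(vtt_path)
  lines <- lines[!grepl("-->", lines, fixed = TRUE)]         # drop cue timings
  lines <- lines[!grepl("^(WEBVTT|Kind:|Language:)", lines)] # drop header lines
  lines <- gsub("<[^>]+>", "", lines)                        # strip inline tags
  lines <- trimws(lines)
  paste(unique(lines[nzchar(lines)]), collapse = " ")
}
# e.g. vtt_to_text("transcripts/BYOeamkqHtc.vtt")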
# --- 5. MAIN EXECUTION ---
data_with_urls <- tibble(video_url = "https://www.youtube.com/watch?v=BYOeamkqHtc")
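# The tibble above is a one-video demo. To process your own list, load a CSV
# with a `video_url` column instead (the file name here is hypothetical):
# data_with_urls <- readr::read_csv("my_videos.csv", show_col_types = FALSE)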
data_with_urls <- force_all_cols_to_char(data_with_urls)
if (!URL_COLUMN %in% names(data_with_urls)) {
  stop("Column not found: ", URL_COLUMN, call. = FALSE)
}
urls <- unique(na.omit(data_with_urls[[URL_COLUMN]]))
cat("URL column: ", URL_COLUMN, "\n", sep = "")
cat("Output CSV: ", OUTPUT_CSV, "\n", sep = "")
cat("Transcript language: ", TRANSCRIPT_LANG, "\n", sep = "")
cat("Videos to process: ", length(urls), "\n\n", sep = "")
# Run
for (u in urls) {
  process_and_save_video(
    video_url = u,
    output_csv = OUTPUT_CSV,
    transcript_lang = TRANSCRIPT_LANG,
    verbose = TRUE
  )
}
cat("Done.\n")