favstats/retrieve_yt_metadata.R

## retrieve_yt_metadata.R
# worker.R - Retrieve YT Metadata + Subtitle in R

# --- 1. LOAD LIBRARIES ---
library(readr)
library(dplyr)
library(jsonlite)
library(stringr)
library(httr)

# --- 2. HELPERS ---
`%||%` <- function(a, b) if (is.null(a) || length(a) == 0) b else a

force_all_cols_to_char <- function(df) {
  df %>% dplyr::mutate(across(everything(), as.character))
}

# Robust YouTube ID extractor (watch, youtu.be, shorts, embed)
extract_youtube_id <- function(x) {
  if (is.na(x) || !nzchar(x)) return(NA_character_)
  x <- trimws(x)
  m <- stringr::str_match(
    x,
    "(?i)(?:youtu\\.be/|youtube\\.com/(?:watch\\?v=|shorts/|embed/))([A-Za-z0-9_-]{11})"
  )
  m[, 2]
}

# Cross-platform command runner that writes stdout to a file
run_cmd_to_file <- function(bin, args, outfile, verbose = TRUE) {
  dir.create(dirname(outfile), recursive = TRUE, showWarnings = FALSE)
  if (file.exists(outfile)) file.remove(outfile)

  # system2 works on Mac/Windows/Linux
  # stdout/stderr can be file paths
  status <- suppressWarnings(
    system2(
      command = bin,
      args = args,
      stdout = outfile,
      stderr = if (verbose) "" else TRUE
    )
  )

  # On success, status is 0; otherwise non-zero
  invisible(status)
}

save_data_efficiently <- function(d, path, remove_duplicates = FALSE, ignore_columns = NULL) {
  if (!file.exists(path)) {
    dir_path <- dirname(path)
    if (!dir.exists(dir_path)) dir.create(dir_path, recursive = TRUE)
    readr::write_csv(force_all_cols_to_char(d), path)
    return(invisible())
  }

  existing_cols <- names(readr::read_csv(path, n_max = 0, show_col_types = FALSE))
  new_cols <- names(d)
  is_schema_changing <- !all(new_cols %in% existing_cols)
  d <- force_all_cols_to_char(d)

  if (!is_schema_changing && !remove_duplicates) {
    d_conformed <- d[, existing_cols]
    readr::write_csv(d_conformed, path, append = TRUE)
  } else {
    old_data <- readr::read_csv(path, show_col_types = FALSE, col_types = cols(.default = "c"))
    combined_data <- dplyr::bind_rows(old_data, d)

    if (remove_duplicates) {
      keep_columns <- if (!is.null(ignore_columns)) setdiff(names(combined_data), ignore_columns) else names(combined_data)
      combined_data <- combined_data %>% dplyr::distinct(across(all_of(keep_columns)), .keep_all = TRUE)
    }
    readr::write_csv(combined_data, path)
  }
  invisible()
}

# Ensure transcript and JSON dirs exist
if (!dir.exists("transcripts")) dir.create("transcripts")
if (!dir.exists("yt_json")) dir.create("yt_json")

# --- 3. CONFIG (edit this section) ---
# Keep everything here explicit and simple.
#
# 1) Where yt-dlp and ffmpeg live
#    On macOS (Homebrew), this usually just works as-is.
YTDLP_BIN <- "yt-dlp"          # e.g., "/opt/homebrew/bin/yt-dlp" if needed
FFMPEG_DIR <- ""               # e.g., "/opt/homebrew/bin" or leave empty to use PATH

# 2) Input + output
URL_COLUMN <- "video_url"      # column name with YouTube URLs
OUTPUT_CSV <- "metadata.csv"   # where metadata will be written

# 3) Transcript language
TRANSCRIPT_LANG <- "en"

# --- 4. VIDEO PROCESSING ---
process_and_save_video <- function(video_url, output_csv, transcript_lang = "en", verbose = TRUE) {
  ytid <- extract_youtube_id(video_url)

  if (is.na(ytid)) {
    if (verbose) cat("⚠️  Not a recognized YouTube URL (skipping):", video_url, "\n")
    return(invisible())
  }

  if (verbose) cat("▶️  Processing:", video_url, "\n")

  json_path <- file.path("yt_json", paste0(ytid, ".json"))

  # Build yt-dlp args (no shell redirection; stdout handled by system2)
  args <- c(
    if (nzchar(FFMPEG_DIR)) c("--ffmpeg-location", FFMPEG_DIR) else character(),
    "--write-auto-subs",
    "--sub-format", "vtt",
    "--sub-lang", transcript_lang,
    "--print-json",
    "--skip-download",
    video_url
  )

  if (verbose) {
    cat("📦 Running yt-dlp:\n")
    cat("   bin:", YTDLP_BIN, "\n")
    cat("   args:", paste(args, collapse = " "), "\n")
  }

  status <- run_cmd_to_file(bin = YTDLP_BIN, args = args, outfile = json_path, verbose = verbose)

  if (!file.exists(json_path) || file.size(json_path) == 0) {
    if (verbose) cat("❌ yt-dlp output file not created (exit code:", status, ")\n\n")
    return(invisible())
  }

  json_raw <- readLines(json_path, warn = FALSE)
  json_string <- paste(json_raw, collapse = "\n")

  if (!jsonlite::validate(json_string)) {
    if (verbose) cat("⚠️ Malformed JSON. Skipping.\n\n")
    return(invisible())
  }

  video_meta <- jsonlite::fromJSON(json_string, flatten = TRUE)

  # --- Transcript Handling ---
  if (!is.null(video_meta$requested_subtitles[[transcript_lang]])) {
    transcript_url <- video_meta$requested_subtitles[[transcript_lang]]$url
    if (verbose) cat("📄 Transcript URL found, downloading...\n")
    tryCatch({
      resp <- httr::GET(transcript_url)
      lines <- httr::content(resp, as = "text", encoding = "UTF-8")
      lines <- strsplit(lines, "\n")[[1]]
      readr::write_lines(lines, file.path("transcripts", paste0(ytid, ".vtt")))
      if (verbose) cat("✅ Transcript saved for:", ytid, "\n")
    }, error = function(e) {
      if (verbose) cat("❌ Transcript download failed:", conditionMessage(e), "\n")
    })
  } else {
    if (verbose) cat("📭 No transcript found for:", ytid, "\n")
  }

  # --- Metadata Construction (FULL schema as in your original) ---
  video_df <- data.frame(
    id = video_meta$id %||% NA,
    title = video_meta$title %||% NA,
    fulltitle = video_meta$fulltitle %||% NA,
    ext = video_meta$ext %||% NA,
    alt_title = video_meta$alt_title %||% NA,
    description = video_meta$description %||% NA,
    display_id = video_meta$display_id %||% NA,
    uploader = video_meta$uploader %||% NA,
    uploader_id = video_meta$uploader_id %||% NA,
    uploader_url = video_meta$uploader_url %||% NA,
    license = video_meta$license %||% NA,
    creators = paste(video_meta$creators, collapse = "; ") %||% NA,
    creator = video_meta$creator %||% NA,
    timestamp = video_meta$timestamp %||% NA,
    upload_date = video_meta$upload_date %||% NA,
    release_timestamp = video_meta$release_timestamp %||% NA,
    release_date = video_meta$release_date %||% NA,
    release_year = video_meta$release_year %||% NA,
    modified_timestamp = video_meta$modified_timestamp %||% NA,
    modified_date = video_meta$modified_date %||% NA,
    channel = video_meta$channel %||% NA,
    channel_id = video_meta$channel_id %||% NA,
    channel_url = video_meta$channel_url %||% NA,
    channel_follower_count = video_meta$channel_follower_count %||% NA,
    channel_is_verified = video_meta$channel_is_verified %||% NA,
    location = video_meta$location %||% NA,
    duration = video_meta$duration %||% NA,
    duration_string = video_meta$duration_string %||% NA,
    view_count = video_meta$view_count %||% NA,
    concurrent_view_count = video_meta$concurrent_view_count %||% NA,
    like_count = video_meta$like_count %||% NA,
    dislike_count = video_meta$dislike_count %||% NA,
    repost_count = video_meta$repost_count %||% NA,
    average_rating = video_meta$average_rating %||% NA,
    comment_count = video_meta$comment_count %||% NA,
    age_limit = video_meta$age_limit %||% NA,
    live_status = video_meta$live_status %||% NA,
    is_live = video_meta$is_live %||% NA,
    was_live = video_meta$was_live %||% NA,
    playable_in_embed = video_meta$playable_in_embed %||% NA,
    availability = video_meta$availability %||% NA,
    media_type = video_meta$media_type %||% NA,
    start_time = video_meta$start_time %||% NA,
    end_time = video_meta$end_time %||% NA,
    extractor = video_meta$extractor %||% NA,
    extractor_key = video_meta$extractor_key %||% NA,
    epoch = video_meta$epoch %||% NA,
    playlist_id = video_meta$playlist_id %||% NA,
    playlist_title = video_meta$playlist_title %||% NA,
    playlist = video_meta$playlist %||% NA,
    playlist_count = video_meta$playlist_count %||% NA,
    playlist_index = video_meta$playlist_index %||% NA,
    playlist_uploader = video_meta$playlist_uploader %||% NA,
    playlist_uploader_id = video_meta$playlist_uploader_id %||% NA,
    playlist_channel = video_meta$playlist_channel %||% NA,
    playlist_channel_id = video_meta$playlist_channel_id %||% NA,
    playlist_webpage_url = video_meta$playlist_webpage_url %||% NA,
    webpage_url = video_meta$webpage_url %||% NA,
    webpage_url_basename = video_meta$webpage_url_basename %||% NA,
    webpage_url_domain = video_meta$webpage_url_domain %||% NA,
    original_url = video_meta$original_url %||% NA,
    categories = paste(video_meta$categories, collapse = "; ") %||% NA,
    tags = paste(video_meta$tags, collapse = "; ") %||% NA,
    cast = paste(video_meta$cast, collapse = "; ") %||% NA
  )

  # keep the raw input url + ytid for easier debugging/joins
  video_df$input_url <- video_url
  video_df$ytid <- ytid

  save_data_efficiently(d = video_df, path = output_csv)

  if (verbose) cat("📅 Metadata saved for:", ytid, "\n\n")
  invisible()
}

# --- 5. MAIN EXECUTION ---

data_with_urls <- tibble(video_url = "https://www.youtube.com/watch?v=BYOeamkqHtc")

data_with_urls <- force_all_cols_to_char(data_with_urls)

if (!URL_COLUMN %in% names(data_with_urls)) {
  stop("Column not found: ", URL_COLUMN, call. = FALSE)
}

urls <- unique(na.omit(data_with_urls[[URL_COLUMN]]))

cat("URL column: ", URL_COLUMN, "\n", sep = "")
cat("Output CSV: ", OUTPUT_CSV, "\n", sep = "")
cat("Transcript language: ", TRANSCRIPT_LANG, "\n", sep = "")
cat("Videos to process: ", length(urls), "\n\n", sep = "")

# Run
for (u in urls) {
  process_and_save_video(
    video_url = u,
    output_csv = OUTPUT_CSV,
    transcript_lang = TRANSCRIPT_LANG,
    verbose = TRUE
  )
}

cat("Done.\n")
	# worker.R - Retrieve YT Metadata + Subtitle in R

	# --- 1. LOAD LIBRARIES ---
	library(readr)
	library(dplyr)
	library(jsonlite)
	library(stringr)
	library(httr)

	# --- 2. HELPERS ---
	`%\|\|%` <- function(a, b) if (is.null(a) \|\| length(a) == 0) b else a

	force_all_cols_to_char <- function(df) {
	df %>% dplyr::mutate(across(everything(), as.character))
	}

	# Robust YouTube ID extractor (watch, youtu.be, shorts, embed)
	extract_youtube_id <- function(x) {
	if (is.na(x) \|\| !nzchar(x)) return(NA_character_)
	x <- trimws(x)
	m <- stringr::str_match(
	x,
	"(?i)(?:youtu\\.be/\|youtube\\.com/(?:watch\\?v=\|shorts/\|embed/))([A-Za-z0-9_-]{11})"
	)
	m[, 2]
	}

	# Cross-platform command runner that writes stdout to a file
	run_cmd_to_file <- function(bin, args, outfile, verbose = TRUE) {
	dir.create(dirname(outfile), recursive = TRUE, showWarnings = FALSE)
	if (file.exists(outfile)) file.remove(outfile)

	# system2 works on Mac/Windows/Linux
	# stdout/stderr can be file paths
	status <- suppressWarnings(
	system2(
	command = bin,
	args = args,
	stdout = outfile,
	stderr = if (verbose) "" else TRUE
	)
	)

	# On success, status is 0; otherwise non-zero
	invisible(status)
	}

	save_data_efficiently <- function(d, path, remove_duplicates = FALSE, ignore_columns = NULL) {
	if (!file.exists(path)) {
	dir_path <- dirname(path)
	if (!dir.exists(dir_path)) dir.create(dir_path, recursive = TRUE)
	readr::write_csv(force_all_cols_to_char(d), path)
	return(invisible())
	}

	existing_cols <- names(readr::read_csv(path, n_max = 0, show_col_types = FALSE))
	new_cols <- names(d)
	is_schema_changing <- !all(new_cols %in% existing_cols)
	d <- force_all_cols_to_char(d)

	if (!is_schema_changing && !remove_duplicates) {
	d_conformed <- d[, existing_cols]
	readr::write_csv(d_conformed, path, append = TRUE)
	} else {
	old_data <- readr::read_csv(path, show_col_types = FALSE, col_types = cols(.default = "c"))
	combined_data <- dplyr::bind_rows(old_data, d)

	if (remove_duplicates) {
	keep_columns <- if (!is.null(ignore_columns)) setdiff(names(combined_data), ignore_columns) else names(combined_data)
	combined_data <- combined_data %>% dplyr::distinct(across(all_of(keep_columns)), .keep_all = TRUE)
	}
	readr::write_csv(combined_data, path)
	}
	invisible()
	}

	# Ensure transcript and JSON dirs exist
	if (!dir.exists("transcripts")) dir.create("transcripts")
	if (!dir.exists("yt_json")) dir.create("yt_json")

	# --- 3. CONFIG (edit this section) ---
	# Keep everything here explicit and simple.
	#
	# 1) Where yt-dlp and ffmpeg live
	# On macOS (Homebrew), this usually just works as-is.
	YTDLP_BIN <- "yt-dlp" # e.g., "/opt/homebrew/bin/yt-dlp" if needed
	FFMPEG_DIR <- "" # e.g., "/opt/homebrew/bin" or leave empty to use PATH

	# 2) Input + output
	URL_COLUMN <- "video_url" # column name with YouTube URLs
	OUTPUT_CSV <- "metadata.csv" # where metadata will be written

	# 3) Transcript language
	TRANSCRIPT_LANG <- "en"

	# --- 4. VIDEO PROCESSING ---
	process_and_save_video <- function(video_url, output_csv, transcript_lang = "en", verbose = TRUE) {
	ytid <- extract_youtube_id(video_url)

	if (is.na(ytid)) {
	if (verbose) cat("⚠️ Not a recognized YouTube URL (skipping):", video_url, "\n")
	return(invisible())
	}

	if (verbose) cat("▶️ Processing:", video_url, "\n")

	json_path <- file.path("yt_json", paste0(ytid, ".json"))

	# Build yt-dlp args (no shell redirection; stdout handled by system2)
	args <- c(
	if (nzchar(FFMPEG_DIR)) c("--ffmpeg-location", FFMPEG_DIR) else character(),
	"--write-auto-subs",
	"--sub-format", "vtt",
	"--sub-lang", transcript_lang,
	"--print-json",
	"--skip-download",
	video_url
	)

	if (verbose) {
	cat("📦 Running yt-dlp:\n")
	cat(" bin:", YTDLP_BIN, "\n")
	cat(" args:", paste(args, collapse = " "), "\n")
	}

	status <- run_cmd_to_file(bin = YTDLP_BIN, args = args, outfile = json_path, verbose = verbose)

	if (!file.exists(json_path) \|\| file.size(json_path) == 0) {
	if (verbose) cat("❌ yt-dlp output file not created (exit code:", status, ")\n\n")
	return(invisible())
	}

	json_raw <- readLines(json_path, warn = FALSE)
	json_string <- paste(json_raw, collapse = "\n")

	if (!jsonlite::validate(json_string)) {
	if (verbose) cat("⚠️ Malformed JSON. Skipping.\n\n")
	return(invisible())
	}

	video_meta <- jsonlite::fromJSON(json_string, flatten = TRUE)

	# --- Transcript Handling ---
	if (!is.null(video_meta$requested_subtitles[[transcript_lang]])) {
	transcript_url <- video_meta$requested_subtitles[[transcript_lang]]$url
	if (verbose) cat("📄 Transcript URL found, downloading...\n")
	tryCatch({
	resp <- httr::GET(transcript_url)
	lines <- httr::content(resp, as = "text", encoding = "UTF-8")
	lines <- strsplit(lines, "\n")[[1]]
	readr::write_lines(lines, file.path("transcripts", paste0(ytid, ".vtt")))
	if (verbose) cat("✅ Transcript saved for:", ytid, "\n")
	}, error = function(e) {
	if (verbose) cat("❌ Transcript download failed:", conditionMessage(e), "\n")
	})
	} else {
	if (verbose) cat("📭 No transcript found for:", ytid, "\n")
	}

	# --- Metadata Construction (FULL schema as in your original) ---
	video_df <- data.frame(
	id = video_meta$id %\|\|% NA,
	title = video_meta$title %\|\|% NA,
	fulltitle = video_meta$fulltitle %\|\|% NA,
	ext = video_meta$ext %\|\|% NA,
	alt_title = video_meta$alt_title %\|\|% NA,
	description = video_meta$description %\|\|% NA,
	display_id = video_meta$display_id %\|\|% NA,
	uploader = video_meta$uploader %\|\|% NA,
	uploader_id = video_meta$uploader_id %\|\|% NA,
	uploader_url = video_meta$uploader_url %\|\|% NA,
	license = video_meta$license %\|\|% NA,
	creators = paste(video_meta$creators, collapse = "; ") %\|\|% NA,
	creator = video_meta$creator %\|\|% NA,
	timestamp = video_meta$timestamp %\|\|% NA,
	upload_date = video_meta$upload_date %\|\|% NA,
	release_timestamp = video_meta$release_timestamp %\|\|% NA,
	release_date = video_meta$release_date %\|\|% NA,
	release_year = video_meta$release_year %\|\|% NA,
	modified_timestamp = video_meta$modified_timestamp %\|\|% NA,
	modified_date = video_meta$modified_date %\|\|% NA,
	channel = video_meta$channel %\|\|% NA,
	channel_id = video_meta$channel_id %\|\|% NA,
	channel_url = video_meta$channel_url %\|\|% NA,
	channel_follower_count = video_meta$channel_follower_count %\|\|% NA,
	channel_is_verified = video_meta$channel_is_verified %\|\|% NA,
	location = video_meta$location %\|\|% NA,
	duration = video_meta$duration %\|\|% NA,
	duration_string = video_meta$duration_string %\|\|% NA,
	view_count = video_meta$view_count %\|\|% NA,
	concurrent_view_count = video_meta$concurrent_view_count %\|\|% NA,
	like_count = video_meta$like_count %\|\|% NA,
	dislike_count = video_meta$dislike_count %\|\|% NA,
	repost_count = video_meta$repost_count %\|\|% NA,
	average_rating = video_meta$average_rating %\|\|% NA,
	comment_count = video_meta$comment_count %\|\|% NA,
	age_limit = video_meta$age_limit %\|\|% NA,
	live_status = video_meta$live_status %\|\|% NA,
	is_live = video_meta$is_live %\|\|% NA,
	was_live = video_meta$was_live %\|\|% NA,
	playable_in_embed = video_meta$playable_in_embed %\|\|% NA,
	availability = video_meta$availability %\|\|% NA,
	media_type = video_meta$media_type %\|\|% NA,
	start_time = video_meta$start_time %\|\|% NA,
	end_time = video_meta$end_time %\|\|% NA,
	extractor = video_meta$extractor %\|\|% NA,
	extractor_key = video_meta$extractor_key %\|\|% NA,
	epoch = video_meta$epoch %\|\|% NA,
	playlist_id = video_meta$playlist_id %\|\|% NA,
	playlist_title = video_meta$playlist_title %\|\|% NA,
	playlist = video_meta$playlist %\|\|% NA,
	playlist_count = video_meta$playlist_count %\|\|% NA,
	playlist_index = video_meta$playlist_index %\|\|% NA,
	playlist_uploader = video_meta$playlist_uploader %\|\|% NA,
	playlist_uploader_id = video_meta$playlist_uploader_id %\|\|% NA,
	playlist_channel = video_meta$playlist_channel %\|\|% NA,
	playlist_channel_id = video_meta$playlist_channel_id %\|\|% NA,
	playlist_webpage_url = video_meta$playlist_webpage_url %\|\|% NA,
	webpage_url = video_meta$webpage_url %\|\|% NA,
	webpage_url_basename = video_meta$webpage_url_basename %\|\|% NA,
	webpage_url_domain = video_meta$webpage_url_domain %\|\|% NA,
	original_url = video_meta$original_url %\|\|% NA,
	categories = paste(video_meta$categories, collapse = "; ") %\|\|% NA,
	tags = paste(video_meta$tags, collapse = "; ") %\|\|% NA,
	cast = paste(video_meta$cast, collapse = "; ") %\|\|% NA
	)

	# keep the raw input url + ytid for easier debugging/joins
	video_df$input_url <- video_url
	video_df$ytid <- ytid

	save_data_efficiently(d = video_df, path = output_csv)

	if (verbose) cat("📅 Metadata saved for:", ytid, "\n\n")
	invisible()
	}

	# --- 5. MAIN EXECUTION ---

	data_with_urls <- tibble(video_url = "https://www.youtube.com/watch?v=BYOeamkqHtc")

	data_with_urls <- force_all_cols_to_char(data_with_urls)

	if (!URL_COLUMN %in% names(data_with_urls)) {
	stop("Column not found: ", URL_COLUMN, call. = FALSE)
	}

	urls <- unique(na.omit(data_with_urls[[URL_COLUMN]]))

	cat("URL column: ", URL_COLUMN, "\n", sep = "")
	cat("Output CSV: ", OUTPUT_CSV, "\n", sep = "")
	cat("Transcript language: ", TRANSCRIPT_LANG, "\n", sep = "")
	cat("Videos to process: ", length(urls), "\n\n", sep = "")

	# Run
	for (u in urls) {
	process_and_save_video(
	video_url = u,
	output_csv = OUTPUT_CSV,
	transcript_lang = TRANSCRIPT_LANG,
	verbose = TRUE
	)
	}

	cat("Done.\n")
No results found