Skip to content

Instantly share code, notes, and snippets.

@Nachtjagdgeschwader
Created April 6, 2017 20:45
Show Gist options
  • Select an option

  • Save Nachtjagdgeschwader/9125e50fe833b870885df55ec74e0783 to your computer and use it in GitHub Desktop.

Select an option

Save Nachtjagdgeschwader/9125e50fe833b870885df55ec74e0783 to your computer and use it in GitHub Desktop.
Collect information about YouTube videos: Date created, Author (Channel Name), Author (Channel) URL, Title, Description, Likes, Dislikes, Number of Views, Number of Comments
# ---- Browser session setup ----
# Start a PhantomJS binary and connect an RSelenium remote driver to it.
library(RSelenium)

# Location of the PhantomJS executable (adjust to the local installation).
psPath <- "C:/*/phantomjs.exe"

# `pJS` is the handle of the launched PhantomJS process;
# `remDr` is the driver object used for all page interaction further down.
pJS <- phantom(pjs_cmd = psPath)
remDr <- remoteDriver(browserName = "phantomjs")
remDr$open()

# Background on this initial configuration, and on several of the commands
# used in the rest of the script, is available at
# https://gist.github.com/Nachtjagdgeschwader/e0775141c2171f769b8939006ba9624b
# ---- Input: list of video URLs ----
# The .csv file holds YouTube video URLs in its first column and has no
# header row. `stringsAsFactors = FALSE` keeps the column as plain text on
# R versions where factors were the default; `as.character()` is kept as a
# belt-and-braces conversion.
URLs <- read.csv(
  "C:/*/URLs2.csv",
  header = FALSE,
  stringsAsFactors = FALSE
)$V1
URLs <- as.character(URLs)

# Drop missing / empty entries so they are neither visited nor counted.
URLs <- URLs[!is.na(URLs) & nzchar(URLs)]

# One result slot per URL, allocated up front; slots of videos that could not
# be scraped simply stay NULL.
n <- length(URLs)
datalist <- vector("list", n)

# ---- Progress bar ----
library(tcltk)
pb <- tkProgressBar(
  title = "Collecting videos data",
  min = 0,
  max = n,
  width = 200
)
# ---- Helpers ----

# Column-bind the given vectors/matrices, padding the shorter ones with NA
# rows so that every piece has as many rows as the longest one.
# Defined once here instead of being re-created on every loop iteration.
cbind.fill <- function(...) {
  nm <- lapply(list(...), as.matrix)
  n_rows <- max(vapply(nm, nrow, integer(1)))
  do.call(cbind, lapply(nm, function(x) {
    rbind(x, matrix(NA, n_rows - nrow(x), ncol(x)))
  }))
}

# Collapse a value returned by an RSelenium getter into a character scalar.
# NOTE(review): RSelenium getters are documented to return a list; left as
# is, that yields list-columns in the data frame, which cannot be written to
# a .csv file. An empty result becomes NA.
as_scalar_chr <- function(x) {
  x <- unlist(x, use.names = FALSE)
  if (length(x) == 0L) NA_character_ else as.character(x[[1L]])
}

# Text of the first element matching CSS selector `selector`.
element_text <- function(driver, selector) {
  as_scalar_chr(
    driver$findElement(using = "css", selector)$getElementText()
  )
}

# Attribute `attr` of the first element matching CSS selector `selector`.
element_attr <- function(driver, selector, attr) {
  as_scalar_chr(
    driver$findElement(using = "css", selector)$getElementAttribute(attr)
  )
}

# Open `url` in `driver` and return a one-row data frame with the columns
# Date, Author, AuthorURL, URL, Title, Description, Likes, Dislikes, Views
# and Comments. An error is raised (by RSelenium) as soon as any of the
# page elements cannot be found, so a video is either captured completely
# or not at all.
scrape_video <- function(driver, url) {
  driver$navigate(url)

  date <- element_text(driver, "strong.watch-time-text")
  author <- element_text(driver, "div.yt-user-info")
  author_url <- element_attr(
    driver,
    "div.yt-user-info a.g-hovercard.yt-uix-sessionlink.spf-link",
    "href"
  )
  video_url <- element_attr(
    driver,
    "a.ytp-title-link.yt-uix-sessionlink",
    "href"
  )
  title <- element_text(driver, "span#eow-title.watch-title")
  description <- element_text(driver, "p#eow-description")
  likes <- element_text(
    driver,
    ".like-button-renderer-like-button-unclicked .yt-uix-button-content"
  )
  dislikes <- element_text(
    driver,
    ".like-button-renderer-dislike-button-unclicked .yt-uix-button-content"
  )
  views <- element_text(driver, "div.watch-view-count")

  # Press "End" on the page body and wait a second before reading the
  # comment header (presumably the comment section only loads once the page
  # has been scrolled down -- TODO confirm).
  driver$findElement("css", "body")$sendKeysToElement(list(key = "end"))
  Sys.sleep(1)
  # The comment header is stored as raw HTML (outerHTML), not as text.
  comments <- element_attr(
    driver,
    "h2.comment-section-header-renderer",
    "outerHTML"
  )

  video <- data.frame(
    cbind.fill(
      date, author, author_url, video_url, title,
      description, likes, dislikes, views, comments
    ),
    stringsAsFactors = FALSE
  )
  colnames(video) <- c(
    "Date", "Author", "AuthorURL", "URL", "Title",
    "Description", "Likes", "Dislikes", "Views", "Comments"
  )
  video
}

# ---- Main loop ----
# seq_len() (rather than 1:n) makes the loop a no-op when there are no URLs.
for (i in seq_len(n)) {
  tryCatch({
    # Very short pause kept from the original script (presumably to let the
    # progress bar window refresh).
    Sys.sleep(0.001)
    setTkProgressBar(pb, i, label = paste(round(i / n * 100, 1), "% done"))
    datalist[[i]] <- scrape_video(remDr, URLs[i])
  }, error = function(e) {
    # Keep going with the next video, but say which one was lost and why
    # instead of dropping it silently.
    message("Skipping video ", i, " (", URLs[i], "): ", conditionMessage(e))
  })
}

# The progress bar window is no longer needed once the loop has finished.
close(pb)
# ---- Combine the per-video rows and save them ----

# Stack all one-row data frames; empty (NULL) slots left by videos that
# could not be scraped are ignored by rbind().
big_data <- do.call(rbind, datalist)

if (is.null(big_data)) {
  # Nothing was collected at all: still produce a well-formed (header-only)
  # table rather than passing NULL to the writer.
  warning("No video data was collected; writing an empty table.",
          call. = FALSE)
  big_data <- data.frame(
    Date = character(0),
    Author = character(0),
    AuthorURL = character(0),
    URL = character(0),
    Title = character(0),
    Description = character(0),
    Likes = character(0),
    Dislikes = character(0),
    Views = character(0),
    Comments = character(0),
    stringsAsFactors = FALSE
  )
}

# Turn a list-column into a plain character vector (one string per row);
# atomic columns are returned untouched. write.csv2() cannot encode
# list-columns, which is what RSelenium's list-valued getters produce when
# their results are stored without unlisting.
flatten_column <- function(column) {
  if (!is.list(column)) {
    return(column)
  }
  vapply(
    column,
    function(cell) {
      cell <- unlist(cell, use.names = FALSE)
      if (length(cell) == 0L || all(is.na(cell))) {
        NA_character_
      } else {
        paste(cell, collapse = " ")
      }
    },
    character(1),
    USE.NAMES = FALSE
  )
}
big_data[] <- lapply(big_data, flatten_column)

# Report how many videos made it into the table.
message(nrow(big_data), " video(s) collected")

# write.csv2() always uses ";" as separator and always writes the column
# names; passing `sep` or `col.names` is ignored with a warning, so they are
# not supplied here.
write.csv2(
  big_data,
  # destination of the final data save
  "C:/*/Data.csv"
)

# Shut down the browser session and the PhantomJS process behind it, so no
# stray phantomjs process is left running after the script ends.
remDr$close()
pJS$stop()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment