Created
April 6, 2017 20:45
-
-
Save Nachtjagdgeschwader/9125e50fe833b870885df55ec74e0783 to your computer and use it in GitHub Desktop.
Collect information about YouTube videos: Date created, Author (Channel Name), Author (Channel) URL, Title, Description, Likes, Dislikes, Number of Views, Number of Comments
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Setup: load packages, start the browser session, read the list of video | |
| # URLs and create the progress bar used by the scraping loop. | |
| # Packages are loaded up front so a missing one fails before any work starts. | |
| library(RSelenium) | |
| library(tcltk) | |
| # Launch PhantomJS from the given executable and connect a remote driver to it. | |
| psPath <- | |
|   "C:/*/phantomjs.exe" | |
| pJS <- phantom(pjs_cmd = psPath) | |
| remDr <- remoteDriver(browserName = "phantomjs") | |
| remDr$open() | |
| # For more details of the initial configuration and | |
| # some commands used below see | |
| # https://gist.github.com/Nachtjagdgeschwader/e0775141c2171f769b8939006ba9624b | |
| # The input is a headerless .csv file with YouTube videos URLs in the first | |
| # column; read.csv() names that column V1 when header = FALSE. | |
| URLs <- | |
|   read.csv("C:/*/URLs2.csv", | |
|            header = FALSE)$V1 | |
| # Coerce to character in case the column was read as a factor. | |
| URLs <- as.character(URLs) | |
| n <- length(URLs) | |
| # One slot per URL, preallocated instead of growing the list in the loop. | |
| # Slots for URLs that fail simply stay NULL. | |
| datalist <- vector("list", n) | |
| # Progress bar that runs from 0 to the number of URLs. | |
| pb <- | |
|   tkProgressBar( | |
|     title = "Collencting videos data", | |
|     min = 0, | |
|     max = n, | |
|     width = 200 | |
|   ) | |
| # Scrape every URL in `URLs` and store one single-row data frame per video in | |
| # `datalist[[i]]`. A URL that fails is reported with a warning and skipped. | |
| # | |
| # cbind.fill: coerce each argument to a matrix, pad the shorter ones with NA | |
| # rows up to the tallest one, and bind them side by side. Defined once here | |
| # rather than on every loop iteration. | |
| cbind.fill <- function(...) { | |
|   nm <- lapply(list(...), as.matrix) | |
|   n <- max(vapply(nm, nrow, integer(1))) | |
|   do.call(cbind, lapply(nm, function(x) { | |
|     rbind(x, matrix(NA, n - nrow(x), ncol(x))) | |
|   })) | |
| } | |
| # RSelenium hands back element text and attributes wrapped in a list; flatten | |
| # them to a character vector (NA when nothing came back) so that the resulting | |
| # data frame has plain character columns that write.csv2() can serialise. | |
| as_text <- function(x) { | |
|   out <- as.character(unlist(x)) | |
|   if (length(out) == 0L) NA_character_ else out | |
| } | |
| # seq_len() iterates zero times for an empty URL list (1:n would give 1, 0). | |
| for (i in seq_len(n)) { | |
|   tryCatch({ | |
|     Sys.sleep(0.001) | |
|     setTkProgressBar(pb, i, label = paste(round(i / n * 100, 1), "% done")) | |
|     remDr$navigate(URLs[i]) | |
|     Date <- as_text( | |
|       remDr$findElement(using = 'css', "strong.watch-time-text")$getElementText() | |
|     ) | |
|     Author <- as_text( | |
|       remDr$findElement(using = 'css', "div.yt-user-info")$getElementText() | |
|     ) | |
|     AuthorURL <- as_text( | |
|       remDr$findElement( | |
|         using = 'css', | |
|         "div.yt-user-info a.g-hovercard.yt-uix-sessionlink.spf-link" | |
|       )$getElementAttribute("href") | |
|     ) | |
|     URL <- as_text( | |
|       remDr$findElement( | |
|         using = 'css', | |
|         "a.ytp-title-link.yt-uix-sessionlink" | |
|       )$getElementAttribute("href") | |
|     ) | |
|     Title <- as_text( | |
|       remDr$findElement(using = 'css', "span#eow-title.watch-title")$getElementText() | |
|     ) | |
|     Description <- as_text( | |
|       remDr$findElement(using = 'css', "p#eow-description")$getElementText() | |
|     ) | |
|     Likes <- as_text( | |
|       remDr$findElement( | |
|         using = 'css', | |
|         ".like-button-renderer-like-button-unclicked .yt-uix-button-content" | |
|       )$getElementText() | |
|     ) | |
|     Dislikes <- as_text( | |
|       remDr$findElement( | |
|         using = 'css', | |
|         ".like-button-renderer-dislike-button-unclicked .yt-uix-button-content" | |
|       )$getElementText() | |
|     ) | |
|     Views <- as_text( | |
|       remDr$findElement(using = 'css', "div.watch-view-count")$getElementText() | |
|     ) | |
|     # Send the End key to the page body and wait a second before looking up | |
|     # the comment header (presumably so the comment section can load). | |
|     remDr$findElement("css", "body")$sendKeysToElement(list(key = "end")) | |
|     Sys.sleep(1) | |
|     # The comment header is kept as raw outer HTML, not as text. | |
|     Comments <- as_text( | |
|       remDr$findElement( | |
|         using = 'css', | |
|         "h2.comment-section-header-renderer" | |
|       )$getElementAttribute("outerHTML") | |
|     ) | |
|     video_row <- | |
|       data.frame( | |
|         cbind.fill( | |
|           Date, | |
|           Author, | |
|           AuthorURL, | |
|           URL, | |
|           Title, | |
|           Description, | |
|           Likes, | |
|           Dislikes, | |
|           Views, | |
|           Comments | |
|         ), | |
|         stringsAsFactors = FALSE | |
|       ) | |
|     colnames(video_row) <- | |
|       c( | |
|         "Date", | |
|         "Author", | |
|         "AuthorURL", | |
|         "URL", | |
|         "Title", | |
|         "Description", | |
|         "Likes", | |
|         "Dislikes", | |
|         "Views", | |
|         "Comments" | |
|       ) | |
|     datalist[[i]] <- video_row | |
|   }, error = function(e) { | |
|     # Keep going, but say which URL was skipped and why instead of silently | |
|     # dropping it. | |
|     warning("Skipping ", URLs[i], ": ", conditionMessage(e), call. = FALSE) | |
|   }) | |
| } | |
| # Dispose of the progress bar window once all URLs have been processed. | |
| close(pb) | |
| # Combine the per-video rows into one data frame, write it out as a | |
| # semicolon-separated .csv file and shut the browser session down. | |
| # Entries for URLs that failed are NULL and contribute no rows. | |
| big_data <- do.call(rbind, datalist) | |
| if (is.null(big_data)) { | |
|   # Every URL failed (or there were none), so there is nothing to write. | |
|   warning("No video data was collected; no file written.", call. = FALSE) | |
| } else { | |
|   # Report how many videos were collected. | |
|   print(nrow(big_data)) | |
|   # write.csv2() always writes a header row and uses ";" as the separator; | |
|   # passing col.names or sep is ignored with a warning, so neither is passed. | |
|   write.csv2( | |
|     big_data, | |
|     # destination of the final data save | |
|     "C:/*/Data.csv" | |
|   ) | |
| } | |
| # Close the browser session and stop the PhantomJS process started earlier. | |
| remDr$close() | |
| pJS$stop() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment