-
-
Save briatte/542736520e8b42e6a08e to your computer and use it in GitHub Desktop.
| #' Get a PubMed search index | |
| #' @param query a PubMed search string | |
| #' @return the XML declaration of the search | |
| #' @example | |
| #' # Which articles discuss the WHO FCTC? | |
| #' pubmed_ask("FCTC OR 'Framework Convention on Tobacco Control'") | |
| pubmed_ask <- function(query) { | |
| # change spaces to + and single-quotes to URL-friendly %22 in query | |
| query = gsub("'", "%22", gsub(" ", "+", query)) | |
| query = paste("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=", | |
| query, "&usehistory=y", sep = "") | |
| cat("Querying PubMed on", as.character(Sys.time()), ":\n", query, "\n\n") | |
| # perform search and save history, this will save PMIDS in history | |
| query = xmlTreeParse(getURL(query), asText = TRUE) | |
| cat(xmlValue(query[["doc"]][["eSearchResult"]][["QueryTranslation"]]), "\n\n") | |
| return(query) | |
| } |
| #' Get PubMed number of articles | |
| #' @param query a PubMed search string or PubMed search index | |
| #' @param max optional cap for the number of articles | |
| #' @example | |
| #' # How many articles on the WHO FCTC? | |
| #' pubmed_count("FCTC OR 'Framework Convention on Tobacco Control'") | |
| pubmed_count <- function(query, max = 0) { | |
| if(is.character(query)) | |
| query = pubmed_ask(query) | |
| stopifnot("XMLDocument" %in% class(query)) | |
| n = as.numeric(xmlValue(query[["doc"]][["eSearchResult"]][["Count"]])) | |
| # cap number of articles (for testing purposes; off by default) | |
| if(max > 0 & max < n) | |
| n = max | |
| return(n) | |
| } |
| #' Get undirected edge list of coauthors | |
| #' | |
| #' The weights are Newman-Fowler (inversely proportional to number of coauthors). | |
| #' @return a data frame with three columns (sender, receiver, weight) | |
| #' @example | |
| #' # Network of authors on the WHO FCTC (not run). | |
| #' # pubmed_get("FCTC OR 'Framework Convention on Tobacco Control'", "fctc") | |
| #' # n = pubmed_edges("pubmed_fctc") | |
| #' # Plot with network package (install first). | |
| #' # require(network) | |
| #' # plot(network(n[ 1:2 ], directed = FALSE)) | |
| pubmed_edges <- function(dir) { | |
| tbl = file.path(dir, dir(dir, ".xml")) | |
| tbl = lapply(tbl, function(x) { | |
| pub = xmlTreeParse(x, useInternalNodes = TRUE) | |
| tbl = xpathSApply(pub, "//PubmedArticle/MedlineCitation") | |
| tbl = lapply(tbl, function(x) { | |
| y = paste(xpathSApply(x, "Article/AuthorList/Author/LastName", xmlValue), | |
| xpathSApply(x, "Article/AuthorList/Author/Initials", xmlValue)) | |
| if(length(y) > 1) { | |
| y = expand.grid(y, y) | |
| y = subset(y, Var1 != Var2) # self-loops | |
| y = unique(apply(y, 1, function(x) paste(sort(x), collapse = ","))) | |
| y = ldply(strsplit(y, ",")) | |
| y = data.frame(xpathApply(x, "PMID", xmlValue), y, 1 / nrow(y)) | |
| names(y) = c("pmid", "i", "j", "w") | |
| } else { | |
| y = data.frame() | |
| } | |
| return(y) | |
| }) | |
| tbl = rbind.fill(tbl) | |
| write.csv(tbl, gsub("xml", "csv", x)) | |
| return(tbl) | |
| }) | |
| tbl = rbind.fill(tbl) | |
| tbl$uid = apply(tbl[, 2:3], 1, function(x) paste(sort(x), collapse = ",")) | |
| # Newman-Fowler weights | |
| tbl = merge(tbl, aggregate(w ~ uid, sum, data = tbl), by = "uid") | |
| tbl = unique(tbl[, c("i", "j", "w.y") ]) | |
| names(tbl)[3] = "w" | |
| return(tbl) | |
| } |
| #' Search and fetch XML records from PubMed | |
| #' | |
| #' @param query a PubMed search string | |
| #' @param file a string to name the batch files and folder (the "pubmed_" prefix will be appended) | |
| #' @param list optionally returns a list of details on the data (off by default) | |
| #' @param max optional cap for the number of articles (off by default) | |
| #' @param k how many articles per batch (1,000 by default) | |
| #' @examples | |
| #' query = "FCTC OR 'Framework Convention on Tobacco Control'" | |
| #' # Scrape approx. 230 articles on the WHO FCTC (not run). | |
| #' # pubmed_get(query, "fctc") | |
| #' # Scrape twice to fix possible network errors (not run). | |
| #' # for(ii in 1:2) try(pubmed_get(query, "fctc")) | |
| #' # Scrape safely and save data details (not run). | |
| #' # for(ii in 1:2) FCTC = try(pubmed_get(query, "fctc", list = TRUE)) | |
| #' # Plot data details (not run). | |
| #' # require(ggplot2) | |
| #' # qplot(data = FCTC$years, x = year, y = count, stat = "identity", geom = "bar") | |
| #' # qplot(data = FCTC$authors, x = authors, y = count, stat = "identity", geom = "bar") | |
| #' # qplot(data = subset(FCTC$journals, count > 5), | |
| #' # x = reorder(journal, count), y = count, stat = "identity", geom = "bar") + | |
| #' # labs(x = NULL) + | |
| #' # coord_flip() | |
| #' @reference \url{http://rpsychologist.com/how-to-download-complete-xml-records-from-pubmed-and-extract-data} | |
| pubmed_get <- function(query, file, list = FALSE, max = 0, k = 10^3) { | |
| stopifnot(max >= 0) | |
| dir = paste0("pubmed_", file) | |
| # where to save the files (batch folder) | |
| if(!file.exists(dir)) | |
| dir.create(dir, showWarnings = FALSE) | |
| # batch file paths and log file path | |
| file = file.path(dir, file) | |
| log = file.path(paste0(dir, ".log")) | |
| sink(log) | |
| # change spaces to + and single-quotes to URL-friendly %22 in query | |
| q = pubmed_ask(query) | |
| # count number of hits | |
| n = pubmed_count(q, max) | |
| # batch download counter | |
| j = 0 | |
| # calculate how many iterations will be needed | |
| r = (n %/% k) + 1 | |
| # print details on the download loop to log file | |
| cat("Downloading", n, "articles in", r, "batch(es) of", k, "entries\n\n") | |
| # save WebEnv-string, containing "links" to all articles in search | |
| q = xmlValue(q[["doc"]][["eSearchResult"]][["WebEnv"]]) | |
| # batch download loop | |
| for(i in r:1) { | |
| x = paste("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&WebEnv=", | |
| q,"&query_key=1&retmode=xml&retstart=", j, "&retmax=", k, sep = "") | |
| y = paste0(file, str_pad(i, nchar(n), pad = "0"), ".xml") | |
| if(!file.exists(y)) { | |
| # download XML based on hits saved in query (WebEnv) | |
| z = try(getURL(x), silent = TRUE) | |
| if(!"try-error" %in% class(z)) { | |
| write(z, y) # save to batch data folder | |
| } else { | |
| warning("Error while downloading batch file #", i) | |
| Sys.sleep(60) # wait a minute before looping | |
| } | |
| } | |
| # save file name and file size to log file | |
| cat(i, "\n") | |
| cat(x, "\nsaved to", y, as.character(file.info(y)$ctime), "\n\n") | |
| # increment to next batch | |
| j = j + k | |
| } | |
| sink() | |
| # announce final file size | |
| cat("Completed download from PubMed:", n, "articles", | |
| as.integer(sum(file.info(file.path(dir, dir(dir, "xml")))$size) / 10^6), | |
| "MB.\n") | |
| if(list) | |
| return(list( | |
| date = Sys.time(), | |
| search = query, | |
| dir = dir, | |
| log = log, | |
| ncid = q, | |
| count = n, | |
| years = pubmed_years(dir, 0), | |
| journals = pubmed_journals(dir, 0), | |
| authors = pubmed_authors(dir, 0) | |
| )) | |
| } |
| #' Get journal titles | |
| #' | |
| #' @param min cut results at minimum number of articles | |
| #' @return a data frame | |
| #' @example | |
| #' # Journal titles for articles on the WHO FCTC (not run). | |
| #' # pubmed_get("FCTC OR 'Framework Convention on Tobacco Control'", "fctc") | |
| #' # pubmed_journals("pubmed_fctc") | |
| pubmed_journals <- function(dir, min = 0) { | |
| tbl = file.path(dir, dir(dir, ".xml")) | |
| tbl = lapply(tbl, function(x) { | |
| pub = xmlTreeParse(x, useInternalNodes = TRUE) | |
| pre = "//PubmedArticle/MedlineCitation/Article/" | |
| tbl = xpathSApply(pub, paste0(pre, "Journal/ISOAbbreviation"), xmlValue) | |
| tbl = table(tbl) | |
| data.frame(journal = names(tbl), count = as.vector(tbl)) | |
| }) | |
| tbl = rbind.fill(tbl) | |
| tbl = aggregate(count ~ journal, sum, data = tbl) | |
| tbl = subset(tbl, count >= min) | |
| return(tbl[ order(-tbl$count), ]) | |
| } |
| #' Get counts of authors (by name) | |
| #' | |
| #' @param min cut results at minimum number of articles | |
| #' @return a data frame | |
| #' @example | |
| #' # Years of publication for articles on the WHO FCTC (not run). | |
| #' # pubmed_get("FCTC OR 'Framework Convention on Tobacco Control'", "fctc") | |
| #' # pubmed_names("pubmed_fctc", 6) | |
| pubmed_names <- function(dir, min = 0) { | |
| tbl = file.path(dir, dir(dir, ".xml")) | |
| tbl = lapply(tbl, function(x) { | |
| pub = xmlTreeParse(x, useInternalNodes = TRUE) | |
| tbl = xpathSApply(pub, "//PubmedArticle/MedlineCitation/Article") | |
| last = lapply(tbl, xpathSApply, "AuthorList/Author/LastName", xmlValue) | |
| init = lapply(tbl, xpathSApply, "AuthorList/Author/Initials", xmlValue) | |
| stopifnot(length(unlist(last)) == length(unlist(init))) | |
| tbl = paste(unlist(last), substr(unlist(init), 1, 1)) | |
| tbl = table(tbl)[ table(tbl) >= min ] | |
| data.frame(author = names(tbl), count = tbl, stringsAsFactors = FALSE) | |
| }) | |
| tbl = rbind.fill(tbl) | |
| tbl = aggregate(count ~ author, sum, data = tbl) | |
| return(tbl[ order(-tbl$count), ]) | |
| } |
| #' Get number of publications per journal and year | |
| #' | |
| #' @param min cut results at minimum number of articles | |
| #' @param top label top n journals (5 by default) | |
| #' @param regex regular expression to include additional journal titles | |
| #' @return a data frame, ordered by journal-year | |
| #' @example | |
| #' # Years of publication for articles on the WHO FCTC (not run). | |
| #' # pubmed_get("FCTC OR 'Framework Convention on Tobacco Control'", "fctc") | |
| #' # FCTC = pubmed_timeline("pubmed_fctc", 0, regex = "World Health Org|BMJ $|Lancet|Tob Control") | |
| #' # Plot the results (not run). | |
| #' # require(ggplot2) | |
| #' # qplot(data = FCTC[ order(FCTC$label), ], x = year, y = count, fill = label, | |
| #' # stat = "identity", geom = "bar") + | |
| #' # scale_fill_brewer("Journal", palette = "Set3", na.value = "grey") + | |
| #' # labs(x = NULL, y = NULL) + | |
| #' # theme(legend.position = "bottom") | |
| pubmed_timeline <- function(dir, min = 0, top = 3, regex = NULL) { | |
| tbl = file.path(dir, dir(dir, ".xml")) | |
| tbl = lapply(tbl, function(x) { | |
| pub = xmlTreeParse(x, useInternalNodes = TRUE) | |
| pre = "//PubmedArticle/MedlineCitation/" | |
| jl = xpathSApply(pub, paste0(pre, "Article/Journal/ISOAbbreviation"), xmlValue) | |
| yr = xpathSApply(pub, paste0(pre, "DateCreated/Year"), xmlValue) | |
| tbl = table(paste(jl, yr)) | |
| data.frame(journal = names(tbl), count = as.vector(tbl)) | |
| }) | |
| tbl = rbind.fill(tbl) | |
| tbl = aggregate(count ~ journal, sum, data = tbl) | |
| tbl = subset(tbl, count >= min) | |
| tbl = data.frame(journal = str_sub(tbl$journal, 1, -5), | |
| year = str_sub(tbl$journal, -4, -1), | |
| count = tbl$count) | |
| tbl$year = as.numeric(as.character(tbl$year)) | |
| if(top > 0) { | |
| # find top journals | |
| label = aggregate(count ~ journal, sum, data = pt) | |
| label = as.character(label[ order(-label$count), 1 ]) | |
| tbl$label = factor(tbl$journal, levels = label, ordered = TRUE) | |
| # trim and refactor | |
| label = head(levels(tbl$label), top) | |
| if(!is.null(regex)) | |
| label = c(label, levels(tbl$label)[ grepl(regex, levels(tbl$label)) ]) | |
| cat("Labeled journals:\n", paste0(label, collapse = ", ")) | |
| tbl$label[ !tbl$label %in% label ] = NA | |
| } | |
| return(tbl[ order(sort(-tbl$year)), ]) | |
| } |
| #' Get years of publication | |
| #' | |
| #' @param min cut results at minimum number of articles | |
| #' @return a data frame, ordered by year | |
| #' @example | |
| #' # Years of publication for articles on the WHO FCTC (not run). | |
| #' # pubmed_get("FCTC OR 'Framework Convention on Tobacco Control'", "fctc") | |
| #' # pubmed_years("pubmed_fctc", 0) | |
| pubmed_years <- function(dir, min = 0) { | |
| tbl = file.path(dir, dir(dir, ".xml")) | |
| tbl = lapply(tbl, function(x) { | |
| pub = xmlTreeParse(x, useInternalNodes = TRUE) | |
| pre = "//PubmedArticle/PubmedData/History/" | |
| tbl = xpathSApply(pub, paste0(pre, "PubMedPubDate[@PubStatus='medline']/Year"), xmlValue) | |
| tbl = table(tbl) | |
| data.frame(year = names(tbl), count = as.vector(tbl)) | |
| }) | |
| tbl = rbind.fill(tbl) | |
| tbl = aggregate(count ~ year, sum, data = tbl) | |
| tbl = subset(tbl, count >= min) | |
| tbl$year = as.numeric(as.character(tbl$year)) | |
| tbl$year = factor(tbl$year, levels = min(tbl$year):max(tbl$year)) | |
| return(tbl[ order(sort(-as.numeric(tbl$year))), ]) | |
| } |
Thank you for your great code but I am having a few problems:
a) easy one but you should maybe list the prerequisite libraries. I found out they are at least: XML and Curl
b) Trying out your pubmed_ask I get the following error:
Querying PubMed on 2017-11-05 11:17:41 :
http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=FCTC+OR+%22Framework+Convention+on+Tobacco+Control%22&usehistory=y
Space required after the Public Identifier
SystemLiteral " or ' expected
SYSTEM or PUBLIC, the URI is missing
Any solution please? Thank you
I was getting the same error. Looks like eutils moved from http to https. Fixing the url in the function to "https://..." seemed to do the trick for me. Good luck!
Briatte, this is a great resource! Thanks for sharing!
Hi Briatte,
I am working on extracting pubmed info, but using rentrez package in R. I am curious to know how are you addressing NULL values at nodes. I am trying to extract abstracts from pubmed. I have posted a Q on SO, if you have suggestion please let me know.
https://stackoverflow.com/questions/32953096/xpathapply-how-to-address-empty-lists