Last active
March 10, 2026 16:40
-
-
Save mathzero/70f33fa243ba0bb4f46b72b8a13646e0 to your computer and use it in GitHub Desktop.
Programmatically search PubMed
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Standalone PubMed literature search script | |
| # | |
| # Install packages once: | |
| # install.packages(c("rentrez", "xml2")) | |
| # | |
| # Getting an NCBI API key: | |
| # 1. Create or sign in to a My NCBI account: | |
| # https://www.ncbi.nlm.nih.gov/account/ | |
| # 2. Open Account Settings and create a key in "API Key Management". | |
| # 3. Save the key for your R session: | |
| # Sys.setenv(ENTREZ_KEY = "your_key_here") | |
| # | |
| # Official NCBI references: | |
| # - API key setup: https://www.ncbi.nlm.nih.gov/sites/books/NBK53593/ | |
| # - E-utilities usage limits: https://www.ncbi.nlm.nih.gov/books/NBK25497/ | |
| # | |
| # Notes: | |
| # - The script works without an API key, but NCBI allows fewer requests per second. | |
| # - This script accepts either a PubMed query string or a vector of PMIDs. | |
| # - Set retrieve_abstract = TRUE if you also want abstracts and affiliations. | |
| # | |
| # Example query: | |
| # query <- paste( | |
| # "\"atopic dermatitis\"[Title/Abstract]", | |
| # "AND dupilumab[Title/Abstract]", | |
| # "AND (2020/01/01[Date - Publication] : 2025/12/31[Date - Publication])" | |
| # ) | |
| # results <- fetch_pubmed_data(query, retrieve_abstract = TRUE) | |
| # head(results[, c("uid", "title", "pubdate", "fulljournalname", "doi")]) | |
check_pubmed_packages <- function() {
  # Verify that the packages this script depends on are available, and stop
  # with one consolidated error message naming every missing package.
  required <- c("rentrez", "xml2")
  available <- vapply(required, requireNamespace, logical(1), quietly = TRUE)
  if (any(!available)) {
    stop(
      "Please install the required package(s): ",
      paste(required[!available], collapse = ", "),
      call. = FALSE
    )
  }
}
normalize_api_key <- function(api_key = NULL) {
  # Resolve the effective NCBI API key: prefer a usable explicit argument,
  # fall back to the ENTREZ_KEY environment variable, and return NULL when
  # neither yields a non-empty value.
  supplied <- !is.null(api_key) &&
    length(api_key) > 0L &&
    !is.na(api_key[[1]]) &&
    nzchar(api_key[[1]])
  if (supplied) {
    return(api_key[[1]])
  }
  env_key <- Sys.getenv("ENTREZ_KEY", unset = "")
  if (nzchar(env_key)) env_key else NULL
}
default_delay <- function(api_key, delay = NULL) {
  # Pick an inter-request delay: keep a caller-supplied value, otherwise use
  # a conservative pause sized by whether an API key is available (NCBI
  # permits a higher request rate with a key).
  if (!is.null(delay)) {
    return(delay)
  }
  if (is.null(api_key)) 0.35 else 0.11
}
as_int_safe <- function(x, default = 0L) {
  # Coerce x to a single integer, returning `default` (as integer) when the
  # conversion fails, produces NA, or yields an empty vector.
  converted <- suppressWarnings(as.integer(x))
  usable <- length(converted) > 0L && !is.na(converted[[1]])
  if (usable) converted[[1]] else as.integer(default)
}
retry_entrez_call <- function(fun, context, max_retries = 3, delay = 0.35) {
  # Run `fun` and retry on error with exponential backoff (delay doubles
  # after every failure). Once `max_retries` retries are exhausted, abort
  # with the last error message, prefixed by `context`.
  failures <- 0L
  backoff <- delay
  repeat {
    outcome <- tryCatch(fun(), error = identity)
    if (!inherits(outcome, "error")) {
      return(outcome)
    }
    failures <- failures + 1L
    if (failures > max_retries) {
      stop("Maximum retries exceeded for ", context, ". Last error: ", conditionMessage(outcome), call. = FALSE)
    }
    message(context, " failed: ", conditionMessage(outcome))
    message("Retrying in ", backoff, " seconds...")
    Sys.sleep(backoff)
    backoff <- backoff * 2
  }
}
empty_pubmed_df <- function() {
  # Zero-row template carrying the full set of result columns, so callers can
  # rbind() partial results without worrying about missing columns. All
  # columns are character except pmcrefcount, which is integer.
  leading_char_cols <- c(
    "uid", "pubdate", "epubdate", "source", "authors", "lastauthor",
    "title", "volume", "issue", "pages", "lang", "issn", "essn", "pubtype"
  )
  trailing_char_cols <- c(
    "fulljournalname", "elocationid", "doi", "abstract", "affiliations"
  )
  cols <- c(
    setNames(lapply(leading_char_cols, function(nm) character(0)), leading_char_cols),
    list(pmcrefcount = integer(0)),
    setNames(lapply(trailing_char_cols, function(nm) character(0)), trailing_char_cols)
  )
  as.data.frame(cols, stringsAsFactors = FALSE)
}
empty_details_df <- function() {
  # Zero-row template for per-article detail records
  # (PMID + abstract + affiliations), all character columns.
  as.data.frame(
    list(
      uid = character(0),
      abstract = character(0),
      affiliations = character(0)
    ),
    stringsAsFactors = FALSE
  )
}
last_day_of_month <- function(date_obj) {
  # Final calendar day of the month containing `date_obj`: snap to the first
  # of the month, step forward one month, then subtract a single day.
  month_start <- as.Date(format(date_obj, "%Y-%m-01"))
  seq(month_start, by = "1 month", length.out = 2)[2] - 1
}
parse_flexible_date <- function(date_str, is_end_date = FALSE) {
  # Parse a PubMed-style date string into a Date.
  #
  # Accepted inputs (optionally surrounded by double quotes):
  #   "YYYY/MM/DD", "YYYY-MM-DD" -> that exact day
  #   "YYYY/MM",    "YYYY-MM"    -> first day of the month, or the month's
  #                                 last day when is_end_date = TRUE
  #   "YYYY"                     -> Jan 1, or Dec 31 when is_end_date = TRUE
  # Returns NA when nothing matches.
  #
  # Partial strings are expanded to full dates *before* calling as.Date():
  # as.Date()/strptime() behavior for formats that omit the day (e.g. "%Y-%m")
  # is platform-specific (see ?strptime) and may return NA or silently fill
  # fields from the current date, so the original format-loop approach was
  # unreliable for the month- and year-only cases.
  date_str <- gsub('^"|"$', "", trimws(date_str))
  # Full dates: try both separators directly; mismatches yield NA.
  for (fmt in c("%Y/%m/%d", "%Y-%m-%d")) {
    parsed <- as.Date(date_str, format = fmt)
    if (!is.na(parsed)) return(parsed)
  }
  # Year + month: expand to the first of that month explicitly.
  if (grepl("^\\d{4}[/-]\\d{1,2}$", date_str)) {
    parts <- strsplit(date_str, "[/-]")[[1]]
    month_start <- as.Date(sprintf("%s-%02d-01", parts[[1]], as.integer(parts[[2]])))
    if (is.na(month_start)) return(NA)
    if (isTRUE(is_end_date)) {
      # Month's last day = the day before the first of the following month.
      return(seq(month_start, by = "1 month", length.out = 2)[2] - 1)
    }
    return(month_start)
  }
  # Bare year: expand to the start or end of that year.
  if (grepl("^\\d{4}$", date_str)) {
    suffix <- if (isTRUE(is_end_date)) "-12-31" else "-01-01"
    return(as.Date(paste0(date_str, suffix)))
  }
  NA
}
clean_query_text <- function(x) {
  # Normalise a query fragment: collapse runs of whitespace to single spaces
  # and strip any dangling leading/trailing "AND" left behind when a clause
  # has been removed from a boolean query.
  cleaned <- trimws(x)
  cleaned <- gsub("\\s+", " ", cleaned)
  cleaned <- gsub("^AND\\s+", "", cleaned, ignore.case = TRUE)
  cleaned <- gsub("\\s+AND$", "", cleaned, ignore.case = TRUE)
  trimws(cleaned)
}
build_date_query <- function(query_base, start_date_obj, end_date_obj) {
  # Append an explicit [Date - Publication] range clause to `query_base`.
  # A blank base query yields just the date clause on its own.
  range_clause <- sprintf(
    "(\"%s\"[Date - Publication] : \"%s\"[Date - Publication])",
    format(start_date_obj, "%Y/%m/%d"),
    format(end_date_obj, "%Y/%m/%d")
  )
  base <- clean_query_text(query_base)
  if (nzchar(base)) paste(base, "AND", range_clause) else range_clause
}
extract_date_range_from_query <- function(query_string) {
  # Look for an explicit publication-date range clause inside a PubMed query
  # (accepting the [Date - Publication], [DP], and [PDat] field tags). On a
  # match, return the two endpoints plus the query with the clause removed;
  # otherwise fall back to a 1900/01/01 .. today range over the whole query.
  date_range_pattern <- "\\(?\\s*\"?(\\d{4}(?:[/-]\\d{2})?(?:[/-]\\d{2})?)\"?\\s*\\[(?:Date\\s*-\\s*Publication|DP|PDat)\\]\\s*[:\\-]\\s*\"?(\\d{4}(?:[/-]\\d{2})?(?:[/-]\\d{2})?)\"?\\s*\\[(?:Date\\s*-\\s*Publication|DP|PDat)\\]\\s*\\)?"
  match_positions <- regexec(date_range_pattern, query_string, perl = TRUE)
  captured <- regmatches(query_string, match_positions)[[1]]
  if (length(captured) >= 3L) {
    # captured[[1]] is the whole clause; [[2]] and [[3]] are the endpoints.
    full_match <- captured[[1]]
    remainder <- clean_query_text(sub(full_match, "", query_string, fixed = TRUE))
    return(list(
      found = TRUE,
      matched_component = full_match,
      start_date = captured[[2]],
      end_date = captured[[3]],
      query_without_date = remainder
    ))
  }
  list(
    found = FALSE,
    matched_component = NULL,
    start_date = "1900/01/01",
    end_date = format(Sys.Date(), "%Y/%m/%d"),
    query_without_date = clean_query_text(query_string)
  )
}
safe_xml_text <- function(node) {
  # Extract the text content of an XML node; swallow any extraction error
  # and return NA_character_ instead so vapply() callers stay type-stable.
  extract <- function() xml2::xml_text(node)
  tryCatch(extract(), error = function(err) NA_character_)
}
node_exists <- function(node) {
  # TRUE only for a non-NULL, non-empty lookup result that is not xml2's
  # "missing node" sentinel class.
  if (is.null(node)) return(FALSE)
  if (length(node) == 0L) return(FALSE)
  !inherits(node, "xml_missing")
}
parse_pubmed_xml_details <- function(xml_content) {
  # Parse raw PubMed EFetch XML into one detail row per article:
  # uid (PMID), abstract (sections joined with spaces), affiliations
  # (unique strings joined with " ## "). Articles that fail to parse are
  # silently dropped; an empty/unparseable input yields a zero-row frame.
  if (!nzchar(xml_content)) return(empty_details_df())
  xml_records <- xml2::read_xml(xml_content)
  articles <- xml2::xml_find_all(xml_records, "//PubmedArticle")
  if (length(articles) == 0L) return(empty_details_df())
  rows <- vector("list", length(articles))
  for (i in seq_along(articles)) {
    # Wrap each article so one malformed record cannot abort the batch.
    rows[[i]] <- tryCatch({
      article <- articles[[i]]
      pmid_node <- xml2::xml_find_first(article, ".//MedlineCitation/PMID")
      pmid_text <- if (node_exists(pmid_node)) safe_xml_text(pmid_node) else NA_character_
      # Structured abstracts have several AbstractText sections; join the
      # non-blank ones into a single string.
      abstract_nodes <- xml2::xml_find_all(article, ".//Article/Abstract/AbstractText")
      abstract_text <- if (length(abstract_nodes) > 0L) {
        parts <- vapply(abstract_nodes, safe_xml_text, FUN.VALUE = character(1))
        parts <- parts[!is.na(parts) & nzchar(trimws(parts))]
        if (length(parts) > 0L) paste(parts, collapse = " ") else NA_character_
      } else {
        NA_character_
      }
      # Collect affiliations per author; the XPath union also catches the
      # legacy layout where Affiliation sits directly under Author rather
      # than inside AffiliationInfo.
      author_nodes <- xml2::xml_find_all(article, ".//Article/AuthorList/Author")
      affiliations <- character(0)
      if (length(author_nodes) > 0L) {
        for (author_node in author_nodes) {
          affiliation_nodes <- xml2::xml_find_all(
            author_node,
            ".//AffiliationInfo/Affiliation | self::Author/Affiliation[not(ancestor::AffiliationInfo)]"
          )
          if (length(affiliation_nodes) > 0L) {
            affils <- vapply(affiliation_nodes, safe_xml_text, FUN.VALUE = character(1))
            affils <- affils[!is.na(affils) & nzchar(trimws(affils))]
            affiliations <- c(affiliations, affils)
          }
        }
      }
      data.frame(
        uid = pmid_text,
        abstract = abstract_text,
        # De-duplicate before joining so repeated author affiliations
        # appear once.
        affiliations = if (length(affiliations) > 0L) paste(unique(affiliations), collapse = " ## ") else NA_character_,
        stringsAsFactors = FALSE
      )
    }, error = function(e) NULL)
  }
  rows <- Filter(Negate(is.null), rows)
  if (length(rows) == 0L) return(empty_details_df())
  do.call(rbind, rows)
}
summaries_to_df <- function(all_summaries) {
  # Convert the result of rentrez::entrez_summary() into a data frame with
  # one row per record, matching the column layout of empty_pubmed_df().
  # The abstract/affiliations columns are added here as NA placeholders and
  # filled in later by the detail-fetching functions.
  #
  # entrez_summary() returns a single "esummary" object for one record and
  # an "esummary_list" for several, so normalise both shapes (plus a plain
  # list fallback) into a list of per-record summaries first.
  summary_records <- if (inherits(all_summaries, "esummary")) {
    list(all_summaries)
  } else if (inherits(all_summaries, "esummary_list")) {
    unclass(all_summaries)
  } else if (is.list(all_summaries)) {
    all_summaries
  } else {
    list()
  }
  rows <- lapply(summary_records, function(x) {
    if (!is.list(x)) return(NULL)
    # The `authors` element can arrive as a data frame with a `name` column,
    # a list of per-author lists, or a plain atomic vector; collapse
    # whichever shape appears into "name; name; ..." and fall back to NA on
    # any error.
    authors_list <- tryCatch({
      if (!is.null(x$authors)) {
        if (is.data.frame(x$authors) && nrow(x$authors) > 0L && "name" %in% names(x$authors)) {
          paste(x$authors$name, collapse = "; ")
        } else if (is.list(x$authors) && length(x$authors) > 0L) {
          author_names <- vapply(
            x$authors,
            function(author) {
              if (is.list(author) && "name" %in% names(author)) return(author$name)
              if (is.character(author) && length(author) > 0L) return(author[[1]])
              NA_character_
            },
            FUN.VALUE = character(1)
          )
          author_names <- author_names[!is.na(author_names) & nzchar(author_names)]
          if (length(author_names) > 0L) {
            paste(author_names, collapse = "; ")
          } else {
            NA_character_
          }
        } else if (is.atomic(x$authors) && length(x$authors) > 0L) {
          author_values <- as.character(stats::na.omit(x$authors))
          if (length(author_values) > 0L) {
            paste(author_values, collapse = "; ")
          } else {
            NA_character_
          }
        } else {
          NA_character_
        }
      } else {
        NA_character_
      }
    }, error = function(e) NA_character_)
    # One row per summary; every absent field becomes NA so rbind() across
    # batches stays aligned.
    data.frame(
      uid = if (!is.null(x$uid)) x$uid else NA_character_,
      pubdate = if (!is.null(x$pubdate)) x$pubdate else NA_character_,
      epubdate = if (!is.null(x$epubdate)) x$epubdate else NA_character_,
      source = if (!is.null(x$source)) x$source else NA_character_,
      authors = authors_list,
      lastauthor = if (!is.null(x$lastauthor)) x$lastauthor else NA_character_,
      title = if (!is.null(x$title)) x$title else NA_character_,
      volume = if (!is.null(x$volume)) x$volume else NA_character_,
      issue = if (!is.null(x$issue)) x$issue else NA_character_,
      pages = if (!is.null(x$pages)) x$pages else NA_character_,
      lang = if (!is.null(x$lang)) paste(x$lang, collapse = "; ") else NA_character_,
      issn = if (!is.null(x$issn)) x$issn else NA_character_,
      essn = if (!is.null(x$essn)) x$essn else NA_character_,
      pubtype = if (!is.null(x$pubtype)) paste(x$pubtype, collapse = "; ") else NA_character_,
      pmcrefcount = if (!is.null(x$pmcrefcount)) as_int_safe(x$pmcrefcount, NA_integer_) else NA_integer_,
      fulljournalname = if (!is.null(x$fulljournalname)) x$fulljournalname else NA_character_,
      elocationid = if (!is.null(x$elocationid)) x$elocationid else NA_character_,
      # Pull the DOI out of the articleids idtype/value table, taking the
      # first "doi" entry when several exist.
      doi = if (!is.null(x$articleids) && is.data.frame(x$articleids) &&
          all(c("idtype", "value") %in% names(x$articleids))) {
        doi_index <- which(x$articleids$idtype == "doi")
        if (length(doi_index) > 0L) x$articleids$value[[doi_index[[1]]]] else NA_character_
      } else {
        NA_character_
      },
      stringsAsFactors = FALSE
    )
  })
  rows <- Filter(Negate(is.null), rows)
  if (length(rows) == 0L) return(empty_pubmed_df())
  df <- as.data.frame(do.call(rbind, rows), stringsAsFactors = FALSE)
  # rbind of mixed-NA rows can leave pmcrefcount non-integer; re-coerce.
  df$pmcrefcount <- suppressWarnings(as.integer(df$pmcrefcount))
  # Placeholder columns, populated later when retrieve_abstract = TRUE.
  df$abstract <- NA_character_
  df$affiliations <- NA_character_
  df
}
fetch_details_from_ids <- function(uid_vec,
                                   api_key = NULL,
                                   batch_size = 100L,
                                   max_retries = 3,
                                   delay = 0.35) {
  # Download abstract/affiliation details for a vector of PMIDs: post each
  # batch of IDs to the Entrez history server, fetch the records as raw XML,
  # and parse them into detail rows. Rows without a usable uid are dropped.
  ids <- unique(as.character(uid_vec))
  ids <- ids[!is.na(ids) & nzchar(ids)]
  if (length(ids) == 0L) {
    return(empty_details_df())
  }
  api_key <- normalize_api_key(api_key)
  delay <- default_delay(api_key, delay)
  # Clamp the batch size into a sane 20..200 window.
  batch_size <- max(20L, min(200L, as_int_safe(batch_size, 100L)))
  batches <- split(ids, ceiling(seq_along(ids) / batch_size))
  n_batches <- length(batches)
  parsed_batches <- vector("list", n_batches)
  for (batch_idx in seq_len(n_batches)) {
    current_ids <- batches[[batch_idx]]
    message(sprintf("Fetching abstract batch %d/%d by PMID...", batch_idx, n_batches))
    batch_xml <- retry_entrez_call(
      fun = function() {
        # Re-post on every attempt so a retry starts from a fresh history.
        history <- rentrez::entrez_post(db = "pubmed", id = current_ids, api_key = api_key)
        rentrez::entrez_fetch(
          db = "pubmed",
          web_history = history,
          rettype = "xml",
          retstart = 0,
          retmax = length(current_ids),
          parsed = FALSE,
          api_key = api_key
        )
      },
      context = sprintf("PMID XML fetch batch %d/%d", batch_idx, n_batches),
      max_retries = max_retries,
      delay = delay
    )
    parsed_batches[[batch_idx]] <- parse_pubmed_xml_details(batch_xml)
  }
  combined <- do.call(rbind, parsed_batches)
  if (is.null(combined) || !is.data.frame(combined) || nrow(combined) == 0L) {
    return(empty_details_df())
  }
  combined <- combined[!is.na(combined$uid) & nzchar(combined$uid), , drop = FALSE]
  row.names(combined) <- NULL
  combined
}
fetch_details_from_history <- function(web_history,
                                       total_records,
                                       api_key = NULL,
                                       batch_size = 200L,
                                       max_retries = 3,
                                       delay = 0.35) {
  # Download abstract/affiliation details for records already on the Entrez
  # history server, paging through them with retstart/retmax and parsing
  # each XML page into detail rows. Rows without a usable uid are dropped.
  if (total_records <= 0L) return(empty_details_df())
  api_key <- normalize_api_key(api_key)
  delay <- default_delay(api_key, delay)
  offsets <- seq(0, total_records - 1L, by = batch_size)
  n_pages <- length(offsets)
  parsed_pages <- vector("list", n_pages)
  for (page_idx in seq_len(n_pages)) {
    retstart <- offsets[[page_idx]]
    message(sprintf("Fetching abstract batch %d/%d from PubMed history...", page_idx, n_pages))
    page_xml <- retry_entrez_call(
      fun = function() {
        rentrez::entrez_fetch(
          db = "pubmed",
          web_history = web_history,
          rettype = "xml",
          retstart = retstart,
          retmax = batch_size,
          parsed = FALSE,
          api_key = api_key
        )
      },
      context = sprintf("History XML fetch batch %d/%d", page_idx, n_pages),
      max_retries = max_retries,
      delay = delay
    )
    parsed_pages[[page_idx]] <- parse_pubmed_xml_details(page_xml)
  }
  combined <- do.call(rbind, parsed_pages)
  if (is.null(combined) || !is.data.frame(combined) || nrow(combined) == 0L) {
    return(empty_details_df())
  }
  combined <- combined[!is.na(combined$uid) & nzchar(combined$uid), , drop = FALSE]
  row.names(combined) <- NULL
  combined
}
process_records_from_history <- function(web_history,
                                         total_records,
                                         batch_size = 500L,
                                         api_key = NULL,
                                         max_retries = 3,
                                         delay = NULL,
                                         retrieve_abstract = FALSE) {
  # Turn a set of records held on the Entrez history server into the final
  # result data frame: page through esummary in batches, flatten each batch
  # with summaries_to_df(), and (optionally) merge in abstracts/affiliations
  # fetched separately. Duplicate uids are removed at the end.
  if (total_records <= 0L) return(empty_pubmed_df())
  api_key <- normalize_api_key(api_key)
  delay <- default_delay(api_key, delay)
  # esummary pages are capped at 500 records per request.
  batch_size <- max(1L, min(as_int_safe(batch_size, 500L), 500L))
  starts <- seq(0, total_records - 1L, by = batch_size)
  summaries_list <- vector("list", length(starts))
  for (i in seq_along(starts)) {
    retstart <- starts[[i]]
    message(sprintf("Fetching summary batch %d/%d...", i, length(starts)))
    summaries_list[[i]] <- retry_entrez_call(
      fun = function() {
        rentrez::entrez_summary(
          db = "pubmed",
          web_history = web_history,
          retstart = retstart,
          retmax = batch_size,
          api_key = api_key
        )
      },
      context = sprintf("Summary fetch batch %d/%d", i, length(starts)),
      max_retries = max_retries,
      delay = delay
    )
  }
  # Flatten each batch and drop empty frames before binding.
  summary_dfs <- lapply(summaries_list, summaries_to_df)
  summary_dfs <- Filter(function(x) is.data.frame(x) && nrow(x) > 0L, summary_dfs)
  if (length(summary_dfs) == 0L) return(empty_pubmed_df())
  df <- as.data.frame(do.call(rbind, summary_dfs), stringsAsFactors = FALSE)
  row.names(df) <- NULL
  if (isTRUE(retrieve_abstract)) {
    # Prefer the single history-based fetch; if that fails, fall back to
    # re-posting the collected PMIDs in smaller batches.
    details_df <- tryCatch(
      fetch_details_from_history(
        web_history = web_history,
        total_records = total_records,
        api_key = api_key,
        batch_size = 200L,
        max_retries = max_retries,
        delay = delay
      ),
      error = function(e) {
        message("History-based abstract retrieval failed. Falling back to PMID batches.")
        fetch_details_from_ids(
          uid_vec = df$uid,
          api_key = api_key,
          batch_size = 100L,
          max_retries = max_retries,
          delay = delay
        )
      }
    )
    if (is.data.frame(details_df) && nrow(details_df) > 0L) {
      # Left-join details onto summaries by uid via match(); rows without a
      # matching detail record keep their NA placeholders.
      details_df$uid <- as.character(details_df$uid)
      df$uid <- as.character(df$uid)
      row_index <- match(df$uid, details_df$uid)
      matched <- !is.na(row_index)
      df$abstract[matched] <- details_df$abstract[row_index[matched]]
      df$affiliations[matched] <- details_df$affiliations[row_index[matched]]
    }
  }
  df <- df[!duplicated(df$uid), , drop = FALSE]
  row.names(df) <- NULL
  df
}
fetch_data_for_query_chunk <- function(query,
                                       batch_size = 500L,
                                       api_key = NULL,
                                       max_retries = 3,
                                       delay = NULL,
                                       retrieve_abstract = FALSE,
                                       label = NULL) {
  # Run a single PubMed search with the history server enabled, report the
  # match count, and hand the resulting history off to
  # process_records_from_history() for batched retrieval. `label`, when
  # given, replaces the raw query text in log/context messages.
  api_key <- normalize_api_key(api_key)
  delay <- default_delay(api_key, delay)
  search_context <- if (is.null(label)) "PubMed search" else paste0("PubMed search: ", label)
  search_results <- retry_entrez_call(
    fun = function() {
      rentrez::entrez_search(
        db = "pubmed",
        term = query,
        use_history = TRUE,
        retmax = 0,
        api_key = api_key
      )
    },
    context = search_context,
    max_retries = max_retries,
    delay = delay
  )
  total_records <- as_int_safe(search_results$count, 0L)
  display_name <- if (is.null(label)) query else label
  message(sprintf("%s matched %d record(s).", display_name, total_records))
  if (total_records == 0L) {
    return(empty_pubmed_df())
  }
  process_records_from_history(
    web_history = search_results$web_history,
    total_records = total_records,
    batch_size = batch_size,
    api_key = api_key,
    max_retries = max_retries,
    delay = delay,
    retrieve_abstract = retrieve_abstract
  )
}
split_date_range <- function(query_base,
                             start_date_obj,
                             end_date_obj,
                             batch_size = 500L,
                             api_key = NULL,
                             max_retries = 3,
                             delay = NULL,
                             retrieve_abstract = FALSE) {
  # Recursively split a date range until each sub-range matches <= 10,000
  # records (the E-utilities retrieval ceiling per query), fetching each
  # fetchable sub-range via fetch_data_for_query_chunk() and de-duplicating
  # the combined results by uid.
  api_key <- normalize_api_key(api_key)
  delay <- default_delay(api_key, delay)
  interval_query <- build_date_query(query_base, start_date_obj, end_date_obj)
  # Count-only search (retmax = 0) to decide whether this range is fetchable.
  search_results <- retry_entrez_call(
    fun = function() {
      rentrez::entrez_search(db = "pubmed", term = interval_query, retmax = 0, api_key = api_key)
    },
    context = sprintf(
      "Date-range count %s to %s",
      format(start_date_obj, "%Y/%m/%d"),
      format(end_date_obj, "%Y/%m/%d")
    ),
    max_retries = max_retries,
    delay = delay
  )
  sub_total <- as_int_safe(search_results$count, 0L)
  interval_label <- sprintf(
    "Date range %s to %s",
    format(start_date_obj, "%Y/%m/%d"),
    format(end_date_obj, "%Y/%m/%d")
  )
  if (sub_total == 0L) {
    message(interval_label, ": 0 records found.")
    return(empty_pubmed_df())
  }
  # Base case: small enough to fetch directly.
  if (sub_total <= 10000L) {
    return(fetch_data_for_query_chunk(
      query = interval_query,
      batch_size = batch_size,
      api_key = api_key,
      max_retries = max_retries,
      delay = delay,
      retrieve_abstract = retrieve_abstract,
      label = interval_label
    ))
  }
  message(interval_label, ": ", sub_total, " records found. Splitting further.")
  mid_point_days <- floor(as.numeric(end_date_obj - start_date_obj) / 2)
  if (mid_point_days == 0) {
    # The range spans at most one day and still exceeds 10,000 records, so
    # it cannot be halved by days.
    # NOTE(review): with a <= 1-day range, seq.Date(start, end, by = "1
    # month") always has length 1, so the month-wise while-loop below looks
    # unreachable and control should always take the warning branch —
    # confirm before relying on the loop.
    months_seq <- seq.Date(start_date_obj, end_date_obj, by = "1 month")
    if (length(months_seq) <= 1L || (length(months_seq) == 2L && months_seq[[2]] > end_date_obj)) {
      # Accept a truncated fetch (E-utilities caps retrieval) but tell the
      # user the range could not be narrowed further.
      warning(
        "Date range ",
        format(start_date_obj, "%Y/%m/%d"),
        " to ",
        format(end_date_obj, "%Y/%m/%d"),
        " still returns >10,000 records and cannot be split further."
      )
      return(fetch_data_for_query_chunk(
        query = interval_query,
        batch_size = batch_size,
        api_key = api_key,
        max_retries = max_retries,
        delay = delay,
        retrieve_abstract = retrieve_abstract,
        label = interval_label
      ))
    }
    # Month-by-month fallback: fetch each calendar month in turn.
    combined_df <- empty_pubmed_df()
    current_start <- start_date_obj
    while (current_start <= end_date_obj) {
      sub_end <- min(last_day_of_month(current_start), end_date_obj)
      df_sub <- split_date_range(
        query_base = query_base,
        start_date_obj = current_start,
        end_date_obj = sub_end,
        batch_size = batch_size,
        api_key = api_key,
        max_retries = max_retries,
        delay = delay,
        retrieve_abstract = retrieve_abstract
      )
      combined_df <- rbind(combined_df, df_sub)
      current_start <- sub_end + 1
    }
    return(combined_df)
  }
  # Recursive case: binary-split the range at the midpoint day and combine
  # both halves.
  mid_date_obj <- start_date_obj + mid_point_days
  df_first_half <- split_date_range(
    query_base = query_base,
    start_date_obj = start_date_obj,
    end_date_obj = mid_date_obj,
    batch_size = batch_size,
    api_key = api_key,
    max_retries = max_retries,
    delay = delay,
    retrieve_abstract = retrieve_abstract
  )
  df_second_half <- split_date_range(
    query_base = query_base,
    start_date_obj = mid_date_obj + 1,
    end_date_obj = end_date_obj,
    batch_size = batch_size,
    api_key = api_key,
    max_retries = max_retries,
    delay = delay,
    retrieve_abstract = retrieve_abstract
  )
  # Records straddling the split boundary can appear in both halves;
  # de-duplicate by uid.
  out <- rbind(df_first_half, df_second_half)
  out <- out[!duplicated(out$uid), , drop = FALSE]
  row.names(out) <- NULL
  out
}
fetch_pubmed_data <- function(input_source,
                              batch_size = 500,
                              api_key = NULL,
                              max_retries = 3,
                              delay = NULL,
                              retrieve_abstract = FALSE) {
  # Main entry point. Accepts either:
  #   - a vector of PMIDs (numeric, or character strings of digits), or
  #   - a single PubMed query string,
  # and returns a data frame shaped like empty_pubmed_df(), de-duplicated by
  # uid. For query strings matching more than 10,000 records (the
  # E-utilities per-query ceiling), the search is split by publication date.
  # Set retrieve_abstract = TRUE to also fill the abstract/affiliations
  # columns.
  check_pubmed_packages()
  api_key <- normalize_api_key(api_key)
  delay <- default_delay(api_key, delay)
  batch_size <- max(1L, as_int_safe(batch_size, 500L))
  max_retries <- max(0L, as_int_safe(max_retries, 3L))
  # Treat input as PMIDs when it is all whole numbers (numeric without NA,
  # or character strings that are purely digits).
  is_pmid_vector <- ((is.numeric(input_source) &&
    all(!is.na(input_source)) &&
    all(input_source == floor(input_source))) ||
    (is.character(input_source) && all(grepl("^[0-9]+$", input_source)))) &&
    length(input_source) > 0L
  if (isTRUE(is_pmid_vector)) {
    pmid_list <- unique(as.character(input_source))
    pmid_list <- pmid_list[!is.na(pmid_list) & grepl("^[0-9]+$", pmid_list)]
    if (length(pmid_list) == 0L) return(empty_pubmed_df())
    message(sprintf("Input detected as PMIDs (%d record(s)).", length(pmid_list)))
    # Post PMIDs to the history server in chunks, then retrieve each chunk
    # through the same history-based pipeline used for searches.
    chunk_size <- 2000L
    pmid_chunks <- split(pmid_list, ceiling(seq_along(pmid_list) / chunk_size))
    all_chunks <- vector("list", length(pmid_chunks))
    for (i in seq_along(pmid_chunks)) {
      message(sprintf("Posting PMID chunk %d/%d...", i, length(pmid_chunks)))
      posted_history <- retry_entrez_call(
        fun = function() {
          rentrez::entrez_post(db = "pubmed", id = pmid_chunks[[i]], api_key = api_key)
        },
        context = sprintf("PMID post chunk %d/%d", i, length(pmid_chunks)),
        max_retries = max_retries,
        delay = delay
      )
      all_chunks[[i]] <- process_records_from_history(
        web_history = posted_history,
        total_records = length(pmid_chunks[[i]]),
        batch_size = batch_size,
        api_key = api_key,
        max_retries = max_retries,
        delay = delay,
        retrieve_abstract = retrieve_abstract
      )
    }
    out <- do.call(rbind, all_chunks)
    if (is.null(out) || !is.data.frame(out) || nrow(out) == 0L) return(empty_pubmed_df())
    out <- out[!duplicated(out$uid), , drop = FALSE]
    row.names(out) <- NULL
    return(out)
  }
  if (is.character(input_source) && length(input_source) == 1L) {
    query_string <- input_source[[1]]
    message("Input detected as a PubMed query string.")
    # Count-only search first, to decide whether date-based splitting is
    # needed.
    initial_search <- retry_entrez_call(
      fun = function() {
        rentrez::entrez_search(db = "pubmed", term = query_string, retmax = 0, api_key = api_key)
      },
      context = "Initial PubMed count",
      max_retries = max_retries,
      delay = delay
    )
    total_records <- as_int_safe(initial_search$count, 0L)
    message(sprintf("Initial PubMed count: %d record(s).", total_records))
    if (total_records == 0L) return(empty_pubmed_df())
    if (total_records <= 10000L) {
      return(fetch_data_for_query_chunk(
        query = query_string,
        batch_size = batch_size,
        api_key = api_key,
        max_retries = max_retries,
        delay = delay,
        retrieve_abstract = retrieve_abstract
      ))
    }
    message("More than 10,000 results found. Attempting date-based splitting.")
    # Pull an explicit date filter out of the query (or fall back to
    # 1900..today) and split on it.
    date_info <- extract_date_range_from_query(query_string)
    start_date_obj <- parse_flexible_date(date_info$start_date, is_end_date = FALSE)
    end_date_obj <- parse_flexible_date(date_info$end_date, is_end_date = TRUE)
    if (is.na(start_date_obj) || is.na(end_date_obj)) {
      stop(
        "Could not parse the date range used for splitting. ",
        "Add an explicit publication-date filter to the query, for example ",
        "\"2020/01/01[Date - Publication] : 2025/12/31[Date - Publication]\".",
        call. = FALSE
      )
    }
    out <- split_date_range(
      query_base = date_info$query_without_date,
      start_date_obj = start_date_obj,
      end_date_obj = end_date_obj,
      batch_size = batch_size,
      api_key = api_key,
      max_retries = max_retries,
      delay = delay,
      retrieve_abstract = retrieve_abstract
    )
    out <- out[!duplicated(out$uid), , drop = FALSE]
    row.names(out) <- NULL
    return(out)
  }
  stop(
    "input_source must be either a single PubMed query string or a vector of numeric PMIDs.",
    call. = FALSE
  )
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment