Skip to content

Instantly share code, notes, and snippets.

@mathzero
Last active March 10, 2026 16:40
Show Gist options
  • Select an option

  • Save mathzero/70f33fa243ba0bb4f46b72b8a13646e0 to your computer and use it in GitHub Desktop.

Select an option

Save mathzero/70f33fa243ba0bb4f46b72b8a13646e0 to your computer and use it in GitHub Desktop.
Programmatically search PubMed
# Standalone PubMed literature search script
#
# Install packages once:
# install.packages(c("rentrez", "xml2"))
#
# Getting an NCBI API key:
# 1. Create or sign in to a My NCBI account:
# https://www.ncbi.nlm.nih.gov/account/
# 2. Open Account Settings and create a key in "API Key Management".
# 3. Save the key for your R session:
# Sys.setenv(ENTREZ_KEY = "your_key_here")
#
# Official NCBI references:
# - API key setup: https://www.ncbi.nlm.nih.gov/sites/books/NBK53593/
# - E-utilities usage limits: https://www.ncbi.nlm.nih.gov/books/NBK25497/
#
# Notes:
# - The script works without an API key, but NCBI allows fewer requests per second.
# - This script accepts either a PubMed query string or a vector of PMIDs.
# - Set retrieve_abstract = TRUE if you also want abstracts and affiliations.
#
# Example query:
# query <- paste(
# "\"atopic dermatitis\"[Title/Abstract]",
# "AND dupilumab[Title/Abstract]",
# "AND (2020/01/01[Date - Publication] : 2025/12/31[Date - Publication])"
# )
# results <- fetch_pubmed_data(query, retrieve_abstract = TRUE)
# head(results[, c("uid", "title", "pubdate", "fulljournalname", "doi")])
# Verify that the rentrez and xml2 packages are installed, stopping with a
# single error that names every missing package at once.
check_pubmed_packages <- function() {
  required <- c("rentrez", "xml2")
  available <- vapply(required, requireNamespace, logical(1), quietly = TRUE)
  if (!all(available)) {
    stop(
      "Please install the required package(s): ",
      paste(required[!available], collapse = ", "),
      call. = FALSE
    )
  }
}
# Resolve the effective NCBI API key. A supplied non-empty key wins;
# otherwise fall back to the ENTREZ_KEY environment variable; otherwise NULL.
normalize_api_key <- function(api_key = NULL) {
  supplied <- !is.null(api_key) && length(api_key) > 0L &&
    !is.na(api_key[[1]]) && nzchar(api_key[[1]])
  if (supplied) {
    return(api_key[[1]])
  }
  env_key <- Sys.getenv("ENTREZ_KEY", unset = "")
  if (nzchar(env_key)) env_key else NULL
}
# Pick the inter-request delay: an explicit `delay` wins; otherwise use the
# slower pace without an API key (0.35 s) and the faster one with (0.11 s),
# presumably matching NCBI's per-second request limits — see the E-utilities
# usage-limit link in the file header.
default_delay <- function(api_key, delay = NULL) {
  if (!is.null(delay)) {
    return(delay)
  }
  if (is.null(api_key)) 0.35 else 0.11
}
# Coerce the first element of `x` to integer, falling back to `default`
# when the coercion yields nothing or NA (e.g. NULL input or "abc").
as_int_safe <- function(x, default = 0L) {
  candidate <- suppressWarnings(as.integer(x))[1]
  if (is.na(candidate)) as.integer(default) else candidate
}
# Run `fun` with retry-on-error and exponential backoff. The wait starts at
# `delay` seconds and doubles after each failure; after `max_retries` failed
# re-attempts the last error is re-raised via stop(). `context` labels the
# operation in progress/error messages.
retry_entrez_call <- function(fun, context, max_retries = 3, delay = 0.35) {
  backoff <- delay
  for (attempt in seq_len(max_retries + 1L)) {
    outcome <- tryCatch(fun(), error = identity)
    if (!inherits(outcome, "error")) {
      return(outcome)
    }
    if (attempt > max_retries) {
      stop(
        "Maximum retries exceeded for ", context, ". Last error: ",
        conditionMessage(outcome),
        call. = FALSE
      )
    }
    message(context, " failed: ", conditionMessage(outcome))
    message("Retrying in ", backoff, " seconds...")
    Sys.sleep(backoff)
    backoff <- backoff * 2
  }
}
# Zero-row template carrying the full result schema. Downstream rbind()
# calls and the abstract/affiliations merge rely on these exact column
# names, order, and types (all character except integer pmcrefcount).
empty_pubmed_df <- function() {
  template <- list(
    uid = character(0),
    pubdate = character(0),
    epubdate = character(0),
    source = character(0),
    authors = character(0),
    lastauthor = character(0),
    title = character(0),
    volume = character(0),
    issue = character(0),
    pages = character(0),
    lang = character(0),
    issn = character(0),
    essn = character(0),
    pubtype = character(0),
    pmcrefcount = integer(0),
    fulljournalname = character(0),
    elocationid = character(0),
    doi = character(0),
    abstract = character(0),
    affiliations = character(0)
  )
  as.data.frame(template, stringsAsFactors = FALSE)
}
# Zero-row template for the per-article detail columns (abstract and
# affiliations) that are merged onto summary rows by uid.
empty_details_df <- function() {
  as.data.frame(
    list(
      uid = character(0),
      abstract = character(0),
      affiliations = character(0)
    ),
    stringsAsFactors = FALSE
  )
}
# Return the final calendar day of the month containing `date_obj`:
# rewind to the 1st, step one month forward, subtract one day.
last_day_of_month <- function(date_obj) {
  first_of_month <- as.Date(format(date_obj, "%Y-%m-01"))
  seq(first_of_month, by = "1 month", length.out = 2)[[2]] - 1
}
# Parse a PubMed-style date string at day, month, or year granularity.
#
# Accepts "YYYY/MM/DD", "YYYY-MM-DD", "YYYY/MM", "YYYY-MM", or "YYYY";
# surrounding quotes and whitespace are stripped first. Partial dates are
# expanded to the start of the period, or to its end (last day of month /
# Dec 31) when `is_end_date = TRUE`.
#
# Returns a Date, or NA when the string matches none of the formats.
#
# Fix: the previous version relied on as.Date(x, format = "%Y/%m") for
# month-only strings, which yields NA on most platforms because strptime
# requires a day component. Such inputs then fell into the "%Y" branch,
# where paste0("2020-05", "-12-31") parses as "2020-05-12" — a wrong end
# date. Month-level inputs are now normalized explicitly.
parse_flexible_date <- function(date_str, is_end_date = FALSE) {
  date_str <- gsub('^"|"$', "", trimws(date_str))
  # Day-level formats parse directly.
  for (fmt in c("%Y/%m/%d", "%Y-%m-%d")) {
    parsed <- tryCatch(as.Date(date_str, format = fmt), error = function(e) NA)
    if (!is.na(parsed)) {
      return(parsed)
    }
  }
  # Month-level: append day 1 so strptime sees a complete date, then move
  # to the last day of the month for end dates.
  if (grepl("^\\d{4}[/-]\\d{1,2}$", date_str)) {
    month_first <- as.Date(paste0(gsub("/", "-", date_str), "-01"))
    if (!is.na(month_first)) {
      if (!isTRUE(is_end_date)) {
        return(month_first)
      }
      return(seq(month_first, by = "1 month", length.out = 2)[[2]] - 1)
    }
  }
  # Year-level: expand to Jan 1 or Dec 31.
  if (grepl("^\\d{4}$", date_str)) {
    suffix <- if (isTRUE(is_end_date)) "-12-31" else "-01-01"
    return(as.Date(paste0(date_str, suffix)))
  }
  NA
}
# Normalize query text: collapse runs of whitespace and strip a dangling
# leading or trailing AND left behind when a clause was removed.
clean_query_text <- function(x) {
  out <- trimws(x)
  out <- gsub("\\s+", " ", out)
  out <- sub("^AND\\s+", "", out, ignore.case = TRUE)
  out <- sub("\\s+AND$", "", out, ignore.case = TRUE)
  trimws(out)
}
# Append a publication-date clause to `query_base`, joined with AND.
# The base query is tidied first (whitespace collapsed, stray leading /
# trailing ANDs removed); a blank base yields just the date clause.
build_date_query <- function(query_base, start_date_obj, end_date_obj) {
  date_clause <- sprintf(
    "(\"%s\"[Date - Publication] : \"%s\"[Date - Publication])",
    format(start_date_obj, "%Y/%m/%d"),
    format(end_date_obj, "%Y/%m/%d")
  )
  base <- gsub("\\s+", " ", trimws(query_base))
  base <- sub("^AND\\s+", "", base, ignore.case = TRUE)
  base <- sub("\\s+AND$", "", base, ignore.case = TRUE)
  base <- trimws(base)
  if (nzchar(base)) paste(base, "AND", date_clause) else date_clause
}
# Locate an explicit publication-date range inside a PubMed query string.
#
# Recognizes clauses like ("2020/01/01"[Date - Publication] :
# "2025/12/31"[Date - Publication]), including the [DP] and [PDat] field
# aliases, with optional quotes/parentheses and year, year-month, or full
# dates. Returns a list with: found, matched_component, start_date,
# end_date, and query_without_date (the query with the clause removed and
# dangling ANDs cleaned up). When no clause is present, a default range of
# 1900/01/01 through today is reported with found = FALSE.
extract_date_range_from_query <- function(query_string) {
  tidy <- function(text) {
    text <- gsub("\\s+", " ", trimws(text))
    text <- gsub("^AND\\s+", "", text, ignore.case = TRUE)
    text <- gsub("\\s+AND$", "", text, ignore.case = TRUE)
    trimws(text)
  }
  date_pattern <- "\\(?\\s*\"?(\\d{4}(?:[/-]\\d{2})?(?:[/-]\\d{2})?)\"?\\s*\\[(?:Date\\s*-\\s*Publication|DP|PDat)\\]\\s*[:\\-]\\s*\"?(\\d{4}(?:[/-]\\d{2})?(?:[/-]\\d{2})?)\"?\\s*\\[(?:Date\\s*-\\s*Publication|DP|PDat)\\]\\s*\\)?"
  hit <- regmatches(query_string, regexec(date_pattern, query_string, perl = TRUE))[[1]]
  if (length(hit) >= 3L) {
    clause <- hit[[1]]
    return(list(
      found = TRUE,
      matched_component = clause,
      start_date = hit[[2]],
      end_date = hit[[3]],
      query_without_date = tidy(sub(clause, "", query_string, fixed = TRUE))
    ))
  }
  list(
    found = FALSE,
    matched_component = NULL,
    start_date = "1900/01/01",
    end_date = format(Sys.Date(), "%Y/%m/%d"),
    query_without_date = tidy(query_string)
  )
}
# Extract a node's text content, returning NA_character_ instead of
# erroring on unusable input (e.g. NULL or a non-node object).
safe_xml_text <- function(node) {
  tryCatch(
    xml2::xml_text(node),
    error = function(unused) NA_character_
  )
}
# TRUE when an xml2 lookup actually produced a node: not NULL, not empty,
# and not the xml_missing sentinel that xml_find_first returns on no match.
node_exists <- function(node) {
  if (is.null(node) || length(node) == 0L) {
    return(FALSE)
  }
  !inherits(node, "xml_missing")
}
# Parse an efetch PubMed XML payload into a data frame with one row per
# <PubmedArticle>: uid (PMID), abstract (labelled sections joined with a
# space), and affiliations (unique values joined with " ## ").
# Articles that fail to parse are skipped; an empty or article-free payload
# yields the empty template.
parse_pubmed_xml_details <- function(xml_content) {
  if (!nzchar(xml_content)) {
    return(empty_details_df())
  }
  doc <- xml2::read_xml(xml_content)
  articles <- xml2::xml_find_all(doc, "//PubmedArticle")
  if (length(articles) == 0L) {
    return(empty_details_df())
  }
  parse_one <- function(article) {
    pmid_node <- xml2::xml_find_first(article, ".//MedlineCitation/PMID")
    pmid <- if (node_exists(pmid_node)) safe_xml_text(pmid_node) else NA_character_
    # Abstracts may be split into labelled sections; keep the non-blank ones.
    abstract_parts <- vapply(
      xml2::xml_find_all(article, ".//Article/Abstract/AbstractText"),
      safe_xml_text,
      FUN.VALUE = character(1)
    )
    abstract_parts <- abstract_parts[!is.na(abstract_parts) & nzchar(trimws(abstract_parts))]
    abstract <- if (length(abstract_parts) > 0L) {
      paste(abstract_parts, collapse = " ")
    } else {
      NA_character_
    }
    # Walk authors in order, accepting both the AffiliationInfo wrapper and
    # a bare Affiliation element directly under Author.
    affiliations <- character(0)
    for (author in xml2::xml_find_all(article, ".//Article/AuthorList/Author")) {
      found <- xml2::xml_find_all(
        author,
        ".//AffiliationInfo/Affiliation | self::Author/Affiliation[not(ancestor::AffiliationInfo)]"
      )
      if (length(found) > 0L) {
        texts <- vapply(found, safe_xml_text, FUN.VALUE = character(1))
        affiliations <- c(affiliations, texts[!is.na(texts) & nzchar(trimws(texts))])
      }
    }
    data.frame(
      uid = pmid,
      abstract = abstract,
      affiliations = if (length(affiliations) > 0L) {
        paste(unique(affiliations), collapse = " ## ")
      } else {
        NA_character_
      },
      stringsAsFactors = FALSE
    )
  }
  rows <- lapply(articles, function(a) tryCatch(parse_one(a), error = function(e) NULL))
  rows <- Filter(Negate(is.null), rows)
  if (length(rows) == 0L) {
    return(empty_details_df())
  }
  do.call(rbind, rows)
}
# Convert rentrez esummary output (a single esummary record, an
# esummary_list, or a plain list of record lists) into a data frame with
# one row per record. Non-list entries are dropped. abstract/affiliations
# are added as NA placeholders so the result always matches the
# empty_pubmed_df() schema.
summaries_to_df <- function(all_summaries) {
  records <- if (inherits(all_summaries, "esummary")) {
    list(all_summaries)
  } else if (inherits(all_summaries, "esummary_list")) {
    unclass(all_summaries)
  } else if (is.list(all_summaries)) {
    all_summaries
  } else {
    list()
  }
  # NULL-safe field accessors.
  chr_or_na <- function(value) {
    if (is.null(value)) NA_character_ else value
  }
  collapse_or_na <- function(value) {
    if (is.null(value)) NA_character_ else paste(value, collapse = "; ")
  }
  # Authors arrive in several shapes (data frame with a name column, list
  # of per-author lists, or a bare character vector); join names with "; ".
  format_authors <- function(authors) {
    tryCatch({
      if (is.null(authors)) {
        return(NA_character_)
      }
      if (is.data.frame(authors) && nrow(authors) > 0L && "name" %in% names(authors)) {
        return(paste(authors$name, collapse = "; "))
      }
      if (is.list(authors) && length(authors) > 0L) {
        name_vec <- vapply(
          authors,
          function(author) {
            if (is.list(author) && "name" %in% names(author)) return(author$name)
            if (is.character(author) && length(author) > 0L) return(author[[1]])
            NA_character_
          },
          FUN.VALUE = character(1)
        )
        name_vec <- name_vec[!is.na(name_vec) & nzchar(name_vec)]
        return(if (length(name_vec) > 0L) paste(name_vec, collapse = "; ") else NA_character_)
      }
      if (is.atomic(authors) && length(authors) > 0L) {
        values <- as.character(stats::na.omit(authors))
        return(if (length(values) > 0L) paste(values, collapse = "; ") else NA_character_)
      }
      NA_character_
    }, error = function(e) NA_character_)
  }
  # Pull the DOI out of the articleids idtype/value table, if present.
  extract_doi <- function(articleids) {
    if (!is.null(articleids) && is.data.frame(articleids) &&
        all(c("idtype", "value") %in% names(articleids))) {
      doi_rows <- which(articleids$idtype == "doi")
      if (length(doi_rows) > 0L) {
        return(articleids$value[[doi_rows[[1]]]])
      }
    }
    NA_character_
  }
  rows <- lapply(records, function(record) {
    if (!is.list(record)) {
      return(NULL)
    }
    data.frame(
      uid = chr_or_na(record$uid),
      pubdate = chr_or_na(record$pubdate),
      epubdate = chr_or_na(record$epubdate),
      source = chr_or_na(record$source),
      authors = format_authors(record$authors),
      lastauthor = chr_or_na(record$lastauthor),
      title = chr_or_na(record$title),
      volume = chr_or_na(record$volume),
      issue = chr_or_na(record$issue),
      pages = chr_or_na(record$pages),
      lang = collapse_or_na(record$lang),
      issn = chr_or_na(record$issn),
      essn = chr_or_na(record$essn),
      pubtype = collapse_or_na(record$pubtype),
      pmcrefcount = if (is.null(record$pmcrefcount)) {
        NA_integer_
      } else {
        as_int_safe(record$pmcrefcount, NA_integer_)
      },
      fulljournalname = chr_or_na(record$fulljournalname),
      elocationid = chr_or_na(record$elocationid),
      doi = extract_doi(record$articleids),
      stringsAsFactors = FALSE
    )
  })
  rows <- Filter(Negate(is.null), rows)
  if (length(rows) == 0L) {
    return(empty_pubmed_df())
  }
  df <- as.data.frame(do.call(rbind, rows), stringsAsFactors = FALSE)
  df$pmcrefcount <- suppressWarnings(as.integer(df$pmcrefcount))
  df$abstract <- NA_character_
  df$affiliations <- NA_character_
  df
}
# Fetch abstract/affiliation details for a vector of PMIDs.
#
# PMIDs are de-duplicated and blanks dropped, then processed in batches
# (clamped to 20-200 ids). Each batch is uploaded with entrez_post before
# fetching its XML, so request URLs stay short regardless of how many ids
# were supplied. Returns a uid/abstract/affiliations data frame; rows whose
# uid could not be parsed are removed.
fetch_details_from_ids <- function(uid_vec,
                                   api_key = NULL,
                                   batch_size = 100L,
                                   max_retries = 3,
                                   delay = 0.35) {
  ids <- unique(as.character(uid_vec))
  ids <- ids[!is.na(ids) & nzchar(ids)]
  if (length(ids) == 0L) {
    return(empty_details_df())
  }
  api_key <- normalize_api_key(api_key)
  delay <- default_delay(api_key, delay)
  batch_size <- max(20L, min(200L, as_int_safe(batch_size, 100L)))
  batches <- split(ids, ceiling(seq_along(ids) / batch_size))
  parsed_batches <- vector("list", length(batches))
  for (batch_index in seq_along(batches)) {
    batch_ids <- batches[[batch_index]]
    message(sprintf("Fetching abstract batch %d/%d by PMID...", batch_index, length(batches)))
    xml_payload <- retry_entrez_call(
      fun = function() {
        # Post the ids first so the fetch can reference server-side history.
        history <- rentrez::entrez_post(db = "pubmed", id = batch_ids, api_key = api_key)
        rentrez::entrez_fetch(
          db = "pubmed",
          web_history = history,
          rettype = "xml",
          retstart = 0,
          retmax = length(batch_ids),
          parsed = FALSE,
          api_key = api_key
        )
      },
      context = sprintf("PMID XML fetch batch %d/%d", batch_index, length(batches)),
      max_retries = max_retries,
      delay = delay
    )
    parsed_batches[[batch_index]] <- parse_pubmed_xml_details(xml_payload)
  }
  combined <- do.call(rbind, parsed_batches)
  if (is.null(combined) || !is.data.frame(combined) || nrow(combined) == 0L) {
    return(empty_details_df())
  }
  combined <- combined[!is.na(combined$uid) & nzchar(combined$uid), , drop = FALSE]
  row.names(combined) <- NULL
  combined
}
# Download article XML for all records behind a web_history key, paging in
# batches of `batch_size`, and parse out uid/abstract/affiliations rows.
# Rows without a parseable uid are dropped; no records yields the empty
# template.
fetch_details_from_history <- function(web_history,
                                       total_records,
                                       api_key = NULL,
                                       batch_size = 200L,
                                       max_retries = 3,
                                       delay = 0.35) {
  if (total_records <= 0L) {
    return(empty_details_df())
  }
  api_key <- normalize_api_key(api_key)
  delay <- default_delay(api_key, delay)
  offsets <- seq(0, total_records - 1L, by = batch_size)
  parsed_batches <- vector("list", length(offsets))
  for (batch_index in seq_along(offsets)) {
    offset <- offsets[[batch_index]]
    message(sprintf(
      "Fetching abstract batch %d/%d from PubMed history...",
      batch_index, length(offsets)
    ))
    xml_payload <- retry_entrez_call(
      fun = function() {
        rentrez::entrez_fetch(
          db = "pubmed",
          web_history = web_history,
          rettype = "xml",
          retstart = offset,
          retmax = batch_size,
          parsed = FALSE,
          api_key = api_key
        )
      },
      context = sprintf("History XML fetch batch %d/%d", batch_index, length(offsets)),
      max_retries = max_retries,
      delay = delay
    )
    parsed_batches[[batch_index]] <- parse_pubmed_xml_details(xml_payload)
  }
  combined <- do.call(rbind, parsed_batches)
  if (is.null(combined) || !is.data.frame(combined) || nrow(combined) == 0L) {
    return(empty_details_df())
  }
  combined <- combined[!is.na(combined$uid) & nzchar(combined$uid), , drop = FALSE]
  row.names(combined) <- NULL
  combined
}
# Fetch esummary records for a PubMed web_history in batches and return a
# data frame matching the empty_pubmed_df() schema.
#
# @param web_history rentrez web_history object from a prior search/post.
# @param total_records Number of records behind the history key.
# @param batch_size Summaries per esummary request; clamped to 1..500.
# @param api_key Optional NCBI API key; falls back to ENTREZ_KEY.
# @param max_retries Retries per failed request (exponential backoff).
# @param delay Seconds between retries; NULL picks a rate-limit default.
# @param retrieve_abstract When TRUE, also fetch the article XML and merge
#   abstract/affiliations columns onto the summary rows by uid.
# @return Data frame de-duplicated by uid; empty template when no records.
process_records_from_history <- function(web_history,
                                         total_records,
                                         batch_size = 500L,
                                         api_key = NULL,
                                         max_retries = 3,
                                         delay = NULL,
                                         retrieve_abstract = FALSE) {
  if (total_records <= 0L) return(empty_pubmed_df())
  api_key <- normalize_api_key(api_key)
  delay <- default_delay(api_key, delay)
  # Clamp to 1..500 records per esummary request.
  batch_size <- max(1L, min(as_int_safe(batch_size, 500L), 500L))
  starts <- seq(0, total_records - 1L, by = batch_size)
  summaries_list <- vector("list", length(starts))
  for (i in seq_along(starts)) {
    retstart <- starts[[i]]
    message(sprintf("Fetching summary batch %d/%d...", i, length(starts)))
    summaries_list[[i]] <- retry_entrez_call(
      fun = function() {
        rentrez::entrez_summary(
          db = "pubmed",
          web_history = web_history,
          retstart = retstart,
          retmax = batch_size,
          api_key = api_key
        )
      },
      context = sprintf("Summary fetch batch %d/%d", i, length(starts)),
      max_retries = max_retries,
      delay = delay
    )
  }
  summary_dfs <- lapply(summaries_list, summaries_to_df)
  summary_dfs <- Filter(function(x) is.data.frame(x) && nrow(x) > 0L, summary_dfs)
  if (length(summary_dfs) == 0L) return(empty_pubmed_df())
  df <- as.data.frame(do.call(rbind, summary_dfs), stringsAsFactors = FALSE)
  row.names(df) <- NULL
  if (isTRUE(retrieve_abstract)) {
    # Prefer a single history-based XML download; if it errors, fall back
    # to re-fetching by explicit PMID batches using the uids already found.
    details_df <- tryCatch(
      fetch_details_from_history(
        web_history = web_history,
        total_records = total_records,
        api_key = api_key,
        batch_size = 200L,
        max_retries = max_retries,
        delay = delay
      ),
      error = function(e) {
        message("History-based abstract retrieval failed. Falling back to PMID batches.")
        fetch_details_from_ids(
          uid_vec = df$uid,
          api_key = api_key,
          batch_size = 100L,
          max_retries = max_retries,
          delay = delay
        )
      }
    )
    if (is.data.frame(details_df) && nrow(details_df) > 0L) {
      details_df$uid <- as.character(details_df$uid)
      df$uid <- as.character(df$uid)
      # Align detail rows to summary rows by uid; unmatched uids keep NA.
      row_index <- match(df$uid, details_df$uid)
      matched <- !is.na(row_index)
      df$abstract[matched] <- details_df$abstract[row_index[matched]]
      df$affiliations[matched] <- details_df$affiliations[row_index[matched]]
    }
  }
  df <- df[!duplicated(df$uid), , drop = FALSE]
  row.names(df) <- NULL
  df
}
# Run a PubMed search for `query` and return its records via
# process_records_from_history. Intended for queries expected to match at
# most 10,000 records (the retrieval ceiling handled by the caller).
# `label` only customizes progress and error messages.
fetch_data_for_query_chunk <- function(query,
                                       batch_size = 500L,
                                       api_key = NULL,
                                       max_retries = 3,
                                       delay = NULL,
                                       retrieve_abstract = FALSE,
                                       label = NULL) {
  api_key <- normalize_api_key(api_key)
  delay <- default_delay(api_key, delay)
  search_context <- if (is.null(label)) "PubMed search" else paste0("PubMed search: ", label)
  # use_history = TRUE stores the hit list server-side for batched paging.
  search_results <- retry_entrez_call(
    fun = function() {
      rentrez::entrez_search(
        db = "pubmed",
        term = query,
        use_history = TRUE,
        retmax = 0,
        api_key = api_key
      )
    },
    context = search_context,
    max_retries = max_retries,
    delay = delay
  )
  record_count <- as_int_safe(search_results$count, 0L)
  message(sprintf(
    "%s matched %d record(s).",
    if (is.null(label)) query else label,
    record_count
  ))
  if (record_count == 0L) {
    return(empty_pubmed_df())
  }
  process_records_from_history(
    web_history = search_results$web_history,
    total_records = record_count,
    batch_size = batch_size,
    api_key = api_key,
    max_retries = max_retries,
    delay = delay,
    retrieve_abstract = retrieve_abstract
  )
}
# Recursively fetch all records for `query_base` restricted to
# [start_date_obj, end_date_obj], bisecting the date range whenever a
# sub-range still matches more than 10,000 records (PubMed's retrieval
# ceiling).
#
# @param query_base Query text WITHOUT a date clause; one is appended here.
# @param start_date_obj,end_date_obj Date objects bounding the interval.
# @param batch_size,api_key,max_retries,delay,retrieve_abstract Passed
#   through to fetch_data_for_query_chunk().
# @return Data frame in the empty_pubmed_df() schema, de-duplicated by uid.
split_date_range <- function(query_base,
                             start_date_obj,
                             end_date_obj,
                             batch_size = 500L,
                             api_key = NULL,
                             max_retries = 3,
                             delay = NULL,
                             retrieve_abstract = FALSE) {
  api_key <- normalize_api_key(api_key)
  delay <- default_delay(api_key, delay)
  interval_query <- build_date_query(query_base, start_date_obj, end_date_obj)
  # Count-only search (retmax = 0) to decide whether this interval fits.
  search_results <- retry_entrez_call(
    fun = function() {
      rentrez::entrez_search(db = "pubmed", term = interval_query, retmax = 0, api_key = api_key)
    },
    context = sprintf(
      "Date-range count %s to %s",
      format(start_date_obj, "%Y/%m/%d"),
      format(end_date_obj, "%Y/%m/%d")
    ),
    max_retries = max_retries,
    delay = delay
  )
  sub_total <- as_int_safe(search_results$count, 0L)
  interval_label <- sprintf(
    "Date range %s to %s",
    format(start_date_obj, "%Y/%m/%d"),
    format(end_date_obj, "%Y/%m/%d")
  )
  if (sub_total == 0L) {
    message(interval_label, ": 0 records found.")
    return(empty_pubmed_df())
  }
  # Small enough to page through directly.
  if (sub_total <= 10000L) {
    return(fetch_data_for_query_chunk(
      query = interval_query,
      batch_size = batch_size,
      api_key = api_key,
      max_retries = max_retries,
      delay = delay,
      retrieve_abstract = retrieve_abstract,
      label = interval_label
    ))
  }
  message(interval_label, ": ", sub_total, " records found. Splitting further.")
  mid_point_days <- floor(as.numeric(end_date_obj - start_date_obj) / 2)
  if (mid_point_days == 0) {
    # The interval spans at most one day, so halving cannot shrink it.
    # NOTE(review): with a 0-1 day span, seq.Date by "1 month" yields a
    # single element, so the month-wise while loop below looks unreachable
    # and the warning branch is what actually runs — confirm before
    # relying on the loop.
    months_seq <- seq.Date(start_date_obj, end_date_obj, by = "1 month")
    if (length(months_seq) <= 1L || (length(months_seq) == 2L && months_seq[[2]] > end_date_obj)) {
      warning(
        "Date range ",
        format(start_date_obj, "%Y/%m/%d"),
        " to ",
        format(end_date_obj, "%Y/%m/%d"),
        " still returns >10,000 records and cannot be split further."
      )
      # Best effort: fetch what PubMed will serve despite the ceiling.
      return(fetch_data_for_query_chunk(
        query = interval_query,
        batch_size = batch_size,
        api_key = api_key,
        max_retries = max_retries,
        delay = delay,
        retrieve_abstract = retrieve_abstract,
        label = interval_label
      ))
    }
    # Walk the interval month by month, recursing on each sub-range.
    combined_df <- empty_pubmed_df()
    current_start <- start_date_obj
    while (current_start <= end_date_obj) {
      sub_end <- min(last_day_of_month(current_start), end_date_obj)
      df_sub <- split_date_range(
        query_base = query_base,
        start_date_obj = current_start,
        end_date_obj = sub_end,
        batch_size = batch_size,
        api_key = api_key,
        max_retries = max_retries,
        delay = delay,
        retrieve_abstract = retrieve_abstract
      )
      combined_df <- rbind(combined_df, df_sub)
      current_start <- sub_end + 1
    }
    return(combined_df)
  }
  # Bisect: recurse on [start, mid] and [mid + 1, end].
  mid_date_obj <- start_date_obj + mid_point_days
  df_first_half <- split_date_range(
    query_base = query_base,
    start_date_obj = start_date_obj,
    end_date_obj = mid_date_obj,
    batch_size = batch_size,
    api_key = api_key,
    max_retries = max_retries,
    delay = delay,
    retrieve_abstract = retrieve_abstract
  )
  df_second_half <- split_date_range(
    query_base = query_base,
    start_date_obj = mid_date_obj + 1,
    end_date_obj = end_date_obj,
    batch_size = batch_size,
    api_key = api_key,
    max_retries = max_retries,
    delay = delay,
    retrieve_abstract = retrieve_abstract
  )
  out <- rbind(df_first_half, df_second_half)
  # Records dated at a range boundary can appear in both halves.
  out <- out[!duplicated(out$uid), , drop = FALSE]
  row.names(out) <- NULL
  out
}
# Main entry point: retrieve PubMed metadata for either a query string or
# a vector of PMIDs.
#
# @param input_source Either a single PubMed query string, or a vector of
#   PMIDs (whole numbers, or strings consisting only of digits).
# @param batch_size Records per summary request (coerced to integer >= 1).
# @param api_key Optional NCBI API key; falls back to the ENTREZ_KEY
#   environment variable.
# @param max_retries Retries per failed request.
# @param delay Seconds between requests; NULL picks a value based on
#   whether an API key is available.
# @param retrieve_abstract When TRUE, also fetch abstracts/affiliations.
# @return Data frame (one row per unique PMID) in the empty_pubmed_df()
#   schema. Queries matching more than 10,000 records are split by
#   publication date, which requires a parseable date filter in the query.
fetch_pubmed_data <- function(input_source,
                              batch_size = 500,
                              api_key = NULL,
                              max_retries = 3,
                              delay = NULL,
                              retrieve_abstract = FALSE) {
  check_pubmed_packages()
  api_key <- normalize_api_key(api_key)
  delay <- default_delay(api_key, delay)
  batch_size <- max(1L, as_int_safe(batch_size, 500L))
  max_retries <- max(0L, as_int_safe(max_retries, 3L))
  # PMIDs are recognised as non-NA whole numbers or all-digit strings.
  is_pmid_vector <- ((is.numeric(input_source) &&
    all(!is.na(input_source)) &&
    all(input_source == floor(input_source))) ||
    (is.character(input_source) && all(grepl("^[0-9]+$", input_source)))) &&
    length(input_source) > 0L
  if (isTRUE(is_pmid_vector)) {
    pmid_list <- unique(as.character(input_source))
    pmid_list <- pmid_list[!is.na(pmid_list) & grepl("^[0-9]+$", pmid_list)]
    if (length(pmid_list) == 0L) return(empty_pubmed_df())
    message(sprintf("Input detected as PMIDs (%d record(s)).", length(pmid_list)))
    # Upload ids in chunks so request URLs stay short; each posted chunk is
    # then paged via its own server-side history key.
    chunk_size <- 2000L
    pmid_chunks <- split(pmid_list, ceiling(seq_along(pmid_list) / chunk_size))
    all_chunks <- vector("list", length(pmid_chunks))
    for (i in seq_along(pmid_chunks)) {
      message(sprintf("Posting PMID chunk %d/%d...", i, length(pmid_chunks)))
      posted_history <- retry_entrez_call(
        fun = function() {
          rentrez::entrez_post(db = "pubmed", id = pmid_chunks[[i]], api_key = api_key)
        },
        context = sprintf("PMID post chunk %d/%d", i, length(pmid_chunks)),
        max_retries = max_retries,
        delay = delay
      )
      all_chunks[[i]] <- process_records_from_history(
        web_history = posted_history,
        total_records = length(pmid_chunks[[i]]),
        batch_size = batch_size,
        api_key = api_key,
        max_retries = max_retries,
        delay = delay,
        retrieve_abstract = retrieve_abstract
      )
    }
    out <- do.call(rbind, all_chunks)
    if (is.null(out) || !is.data.frame(out) || nrow(out) == 0L) return(empty_pubmed_df())
    out <- out[!duplicated(out$uid), , drop = FALSE]
    row.names(out) <- NULL
    return(out)
  }
  if (is.character(input_source) && length(input_source) == 1L) {
    query_string <- input_source[[1]]
    message("Input detected as a PubMed query string.")
    # Count first (retmax = 0) to decide whether date-splitting is needed.
    initial_search <- retry_entrez_call(
      fun = function() {
        rentrez::entrez_search(db = "pubmed", term = query_string, retmax = 0, api_key = api_key)
      },
      context = "Initial PubMed count",
      max_retries = max_retries,
      delay = delay
    )
    total_records <- as_int_safe(initial_search$count, 0L)
    message(sprintf("Initial PubMed count: %d record(s).", total_records))
    if (total_records == 0L) return(empty_pubmed_df())
    if (total_records <= 10000L) {
      return(fetch_data_for_query_chunk(
        query = query_string,
        batch_size = batch_size,
        api_key = api_key,
        max_retries = max_retries,
        delay = delay,
        retrieve_abstract = retrieve_abstract
      ))
    }
    # Over the 10,000-record retrieval ceiling: split by publication date.
    message("More than 10,000 results found. Attempting date-based splitting.")
    date_info <- extract_date_range_from_query(query_string)
    start_date_obj <- parse_flexible_date(date_info$start_date, is_end_date = FALSE)
    end_date_obj <- parse_flexible_date(date_info$end_date, is_end_date = TRUE)
    if (is.na(start_date_obj) || is.na(end_date_obj)) {
      stop(
        "Could not parse the date range used for splitting. ",
        "Add an explicit publication-date filter to the query, for example ",
        "\"2020/01/01[Date - Publication] : 2025/12/31[Date - Publication]\".",
        call. = FALSE
      )
    }
    out <- split_date_range(
      query_base = date_info$query_without_date,
      start_date_obj = start_date_obj,
      end_date_obj = end_date_obj,
      batch_size = batch_size,
      api_key = api_key,
      max_retries = max_retries,
      delay = delay,
      retrieve_abstract = retrieve_abstract
    )
    out <- out[!duplicated(out$uid), , drop = FALSE]
    row.names(out) <- NULL
    return(out)
  }
  stop(
    "input_source must be either a single PubMed query string or a vector of numeric PMIDs.",
    call. = FALSE
  )
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment