Skip to content

Instantly share code, notes, and snippets.

@LukasWallrich
Created February 28, 2026 11:50
Show Gist options
  • Select an option

  • Save LukasWallrich/256753d59de4c74ae1cd8d808e744082 to your computer and use it in GitHub Desktop.

Select an option

Save LukasWallrich/256753d59de4c74ae1cd8d808e744082 to your computer and use it in GitHub Desktop.
PDF conversion action and pilot keyword screening script (public)
name: Continuous GROBID Conversion (Authenticated)

on:
  workflow_dispatch: {}
  # schedule:
  #   - cron: "0 */6 * * *"

permissions:
  contents: write

jobs:
  grobid-service-account:
    runs-on: ubuntu-latest
    steps:
      # The GROBID full image is large; reclaim space first so the
      # docker pull does not fail on the default runner disk.
      - name: Free disk space
        run: |
          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc \
            /opt/hostedtoolcache/CodeQL /usr/local/share/boost \
            "$AGENT_TOOLSDIRECTORY"
          docker rmi $(docker images -aq) 2>/dev/null || true

      - name: Start GROBID
        run: |
          docker run -d --name grobid -p 8070:8070 grobid/grobid:0.8.2.1-full
          # Poll the health endpoint for up to 10 minutes while the model loads.
          timeout 600s bash -c '
          until curl -s localhost:8070/api/isalive > /dev/null; do
            echo "Waiting for GROBID..."
            sleep 5
          done'

      - name: Checkout repo
        uses: actions/checkout@v4

      - name: Setup R with r2u
        uses: eddelbuettel/github-actions/r2u-setup@master

      - name: Install R Packages
        # NOTE: "tools" ships with base R and is not a CRAN package,
        # so it must not be passed to install.packages().
        run: Rscript -e 'install.packages(c("googledrive", "jsonlite", "httr"))'

      # Write the secret to a file for R to use.
      - name: Create Service Account File
        run: echo '${{ secrets.GDRIVE_SA_JSON }}' > service_account.json

      - name: Ensure directories exist
        run: mkdir -p tei

      - name: Run Processing Loop (Authenticated)
        shell: Rscript {0}
        run: |
          library(googledrive)
          library(jsonlite)
          library(httr)
          library(tools)

          # --- CONFIGURATION ---
          FOLDER_ID <- "18vChmJn4Q2biMmTbRWWgwkQd8KE002JR"
          GROBID_URL <- "http://localhost:8070/api/processFulltextDocument"
          JSON_FILE <- "processed.json"
          # We can be more aggressive with batch size now that we are authenticated
          BATCH_SIZE <- 50
          # Stop after 5 hours to stay under the 6-hour job limit.
          TIME_LIMIT <- 5 * 60 * 60
          MAX_RETRIES <- 3

          system('git config user.name "github-actions"')
          system('git config user.email "actions@github.com"')

          # --- AUTHENTICATION ---
          # R reads the file we created in the previous YAML step.
          tryCatch({
            drive_auth(path = "service_account.json")
            message("Authentication successful.")
          }, error = function(e) {
            message("CRITICAL: Authentication failed.")
            quit(status = 1)
          })

          # --- HELPER: SANITIZE FILENAME ---
          # Replace unsafe characters; if the sanitized name already
          # exists, append the Drive file id to avoid overwriting.
          clean_filename <- function(name, id) {
            base <- file_path_sans_ext(name)
            safe_base <- gsub("[^[:alnum:]\\.-]", "_", base)
            candidate <- paste0("tei/", safe_base, ".tei.xml")
            if (file.exists(candidate)) {
              return(paste0("tei/", safe_base, "_", id, ".tei.xml"))
            }
            return(candidate)
          }

          # --- INITIALIZATION ---
          start_time <- Sys.time()
          if (file.exists(JSON_FILE)) {
            processed_ids <- fromJSON(JSON_FILE)
          } else {
            processed_ids <- character(0)
          }
          message("Fetching file list...")
          all_files <- drive_ls(as_id(FOLDER_ID), pattern = "\\.pdf$")
          todo <- all_files[!(all_files$id %in% processed_ids), ]
          total_todo <- nrow(todo)
          if (total_todo == 0) {
            message("All caught up!")
            quit(status = 0)
          }
          message(paste("Queue size:", total_todo))

          # --- LOOP ---
          current_idx <- 1
          total_processed_session <- 0
          while (current_idx <= total_todo) {
            if (as.numeric(difftime(Sys.time(), start_time, units = "secs")) > TIME_LIMIT) break
            end_idx <- min(current_idx + BATCH_SIZE - 1, total_todo)
            batch_files <- todo[current_idx:end_idx, ]
            message(paste0("\n--- Starting Batch: ", current_idx, " to ", end_idx, " ---"))
            batch_processed_ids <- c()
            for (i in seq_len(nrow(batch_files))) {
              f_id <- batch_files$id[i]
              f_name <- batch_files$name[i]
              local_pdf <- "input.pdf"
              final_output_name <- clean_filename(f_name, f_id)
              download_success <- FALSE
              # --- DOWNLOAD (retry with linear backoff) ---
              for (attempt in 1:MAX_RETRIES) {
                if (file.exists(local_pdf)) file.remove(local_pdf)
                tryCatch({
                  drive_download(as_id(f_id), path = local_pdf, overwrite = TRUE, verbose = FALSE)
                  if (file.exists(local_pdf) && file.info(local_pdf)$size > 0) {
                    download_success <- TRUE
                  }
                }, error = function(e) {
                  if (attempt < MAX_RETRIES) Sys.sleep(2 * attempt)
                })
                if (download_success) break
              }
              # --- PROCESS via GROBID ---
              if (download_success) {
                tryCatch({
                  resp <- POST(
                    url = GROBID_URL,
                    body = list(input = upload_file(local_pdf)),
                    encode = "multipart"
                  )
                  if (status_code(resp) == 200) {
                    xml_content <- content(resp, "text", encoding = "UTF-8")
                    # Very short responses are empty TEI shells, not real output.
                    if (nchar(xml_content) > 100) {
                      writeLines(xml_content, final_output_name)
                      batch_processed_ids <- c(batch_processed_ids, f_id)
                      cat(".")
                    } else {
                      message(paste("\nEmpty Result:", f_name))
                    }
                  } else {
                    message(paste("\nGROBID Fail:", f_name))
                  }
                }, error = function(e) {
                  message(paste("\nScript Error:", f_name))
                })
              } else {
                message(paste("\nDownload Fail:", f_name))
              }
              if (file.exists(local_pdf)) file.remove(local_pdf)
            }
            # --- COMMIT progress after each batch so partial work survives ---
            if (length(batch_processed_ids) > 0) {
              current_processed <- if (file.exists(JSON_FILE)) fromJSON(JSON_FILE) else character(0)
              updated_processed <- unique(c(current_processed, batch_processed_ids))
              write_json(updated_processed, JSON_FILE)
              system("git add tei/ processed.json")
              system(paste0("git commit -m 'Processed batch: ", length(batch_processed_ids), " files'"))
              system("git push")
              total_processed_session <- total_processed_session + length(batch_processed_ids)
            }
            current_idx <- end_idx + 1
          }

      # Cleanup the secret file even if earlier steps failed.
      - name: Cleanup Keys
        if: always()
        run: rm -f service_account.json
#!/usr/bin/env Rscript
suppressPackageStartupMessages({
library(dplyr)
library(purrr)
library(stringr)
library(tibble)
library(tidyr)
library(metacheck)
})
# Null-coalescing helper: fall back to `y` when `x` is NULL, has
# length zero, or consists entirely of NA values; otherwise return `x`.
`%||%` <- function(x, y) {
  missing_ish <- is.null(x) || length(x) == 0 || all(is.na(x))
  if (missing_ish) {
    return(y)
  }
  x
}
# Parse `--key=value` command-line flags into the script's option list.
#
# Recognized keys: tei-dir, out-csv, out-md, max-files. Arguments that
# do not start with "--", unknown keys, and flags without a value are
# silently ignored; defaults are returned for anything not overridden.
parse_args <- function(args) {
  opts <- list(
    tei_dir = "tei",
    out_csv = "pilot_keyword_screening.csv",
    out_md = "pilot_keyword_screening_report.md",
    max_files = NA_integer_
  )
  if (length(args) == 0) {
    return(opts)
  }
  for (raw in args) {
    if (!str_detect(raw, "^--")) next
    # Split at the first "=" only; everything after it is the value.
    pieces <- str_split_fixed(sub("^--", "", raw), "=", 2)
    key <- pieces[, 1]
    val <- pieces[, 2]
    if (val == "") next
    switch(key,
      "tei-dir" = opts$tei_dir <- val,
      "out-csv" = opts$out_csv <- val,
      "out-md" = opts$out_md <- val,
      "max-files" = opts$max_files <- as.integer(val)
    )
  }
  opts
}
# Return full paths of all GROBID TEI outputs (files ending in
# ".tei.xml") found directly inside `dir`.
list_tei_files <- function(dir = "tei") {
  tei_pattern <- "\\.tei\\.xml$"
  dir(path = dir, pattern = tei_pattern, full.names = TRUE)
}
# Recover a DOI from a TEI filename.
#
# The filename stem is expected to encode the DOI before an "_-_"
# separator, with the "/" after the "10.NNNN" registrant prefix
# replaced by "_" or "-" (e.g. "10.1234_abcd_-_Title.tei.xml" ->
# "10.1234/abcd"). Returns NA_character_ when no DOI-shaped token is
# found.
extract_doi_from_filename <- function(path) {
  stem <- basename(path)
  stem <- str_remove(stem, "\\.tei\\.xml$")
  stem <- str_remove(stem, "\\.xml$")
  stem <- utils::URLdecode(stem)
  candidate <- str_split_fixed(str_to_lower(stem), "_-_", 2)[, 1]
  # Restore the "/" encoded as "_" or "-" right after the registrant
  # prefix (single character class covers both original replacements).
  candidate <- str_replace(candidate, "^(10\\.[0-9]{4,9})[_-]", "\\1/")
  doi <- str_extract(candidate, "10\\.[0-9]{4,9}/[-._;()/:a-z0-9]+")
  # Strip trailing punctuation that the filename may have picked up.
  doi <- str_replace(doi %||% "", "[\\.,;:]+$", "")
  if (nchar(doi) == 0) NA_character_ else doi
}
# Flatten the full-text tables of one or more metacheck papers into a
# single data frame, tagging every row with a normalized source path.
#
# papers: a scivrs_paperlist or a single paper object.
# Returns one combined data frame; papers whose full_text table is
# missing or empty contribute no rows.
collect_full_text <- function(papers) {
  paper_list <- if (inherits(papers, "scivrs_paperlist")) {
    unname(papers)
  } else {
    list(papers)
  }
  map_dfr(paper_list, function(p) {
    ft <- p$full_text
    # Guard against papers with no full text at all: `ft` may be NULL
    # rather than a zero-row table, and nrow(NULL) would make the `if`
    # condition error out.
    if (is.null(ft) || nrow(ft) == 0) {
      return(tibble())
    }
    # Join key used later against the per-file metadata; fall back to
    # the paper id when no filename was recorded.
    ft$file <- normalizePath(p$info$filename %||% p$id, winslash = "/", mustWork = FALSE)
    ft
  })
}
# Write a Markdown summary of the keyword screening run to `out_md`.
#
# out_md: output file path.
# tei_dir: input directory name (reported verbatim in the header).
# result: per-document data frame with at least a logical `include`
#   column (one row per document).
# keyword_summary / section_summary / keyword_section_summary: data
#   frames rendered as Markdown tables; a zero-row frame renders as a
#   "_No hits found._" placeholder.
# unresolved_doi: count of files whose DOI could not be parsed.
build_markdown_report <- function(
  out_md,
  tei_dir,
  result,
  keyword_summary,
  section_summary,
  keyword_section_summary,
  unresolved_doi
) {
  total <- nrow(result)
  included <- sum(result$include)
  excluded <- total - included
  # Render a data frame as a pipe-delimited Markdown table.
  fmt_table <- function(df) {
    if (nrow(df) == 0) {
      return(c("_No hits found._", ""))
    }
    header <- paste0("| ", paste(names(df), collapse = " | "), " |")
    sep <- paste0("|", paste(rep("---", ncol(df)), collapse = "|"), "|")
    # Convert column-wise: apply(df, 1, ...) would coerce the whole
    # data frame to a character matrix via format(), space-padding
    # numeric cells and polluting the Markdown output.
    col_text <- lapply(df, as.character)
    rows <- vapply(
      seq_len(nrow(df)),
      function(i) {
        cells <- vapply(col_text, `[[`, character(1), i)
        paste0("| ", paste(cells, collapse = " | "), " |")
      },
      character(1)
    )
    c(header, sep, rows, "")
  }
  lines <- c(
    "# Pilot Keyword Screening Report",
    "",
    paste0("Generated: ", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z")),
    paste0("Input directory: `", tei_dir, "`"),
    "",
    "## Summary",
    "",
    paste0("- Total documents: ", total),
    paste0("- Included (>=1 hit): ", included),
    paste0("- Excluded (0 hits): ", excluded),
    paste0("- Missing DOI parsed from filename: ", unresolved_doi),
    "",
    "## Hits Per Keyword",
    ""
  )
  lines <- c(lines, fmt_table(keyword_summary), "## Hits Per Section", "")
  lines <- c(lines, fmt_table(section_summary), "## Hits Per Keyword And Section", "")
  lines <- c(lines, fmt_table(keyword_section_summary))
  writeLines(lines, con = out_md)
}
# Entry point: read GROBID TEI output, screen each document's full text
# for pilot/feasibility keywords, and write a per-document CSV plus a
# Markdown summary report. Paths and limits come from --key=value CLI
# flags (see parse_args).
main <- function() {
  args <- parse_args(commandArgs(trailingOnly = TRUE))
  tei_files <- list_tei_files(args$tei_dir)
  # Optional cap for quick pilot runs; sorted first so the subset of
  # files processed is deterministic across runs.
  if (!is.na(args$max_files) && args$max_files > 0) {
    tei_files <- head(sort(tei_files), args$max_files)
  }
  if (length(tei_files) == 0) {
    stop("No .tei.xml files found in: ", args$tei_dir)
  }
  message("Reading TEI files with metacheck: ", length(tei_files), " files")
  papers <- metacheck::read(tei_files)
  full_text <- collect_full_text(papers)
  # Map each normalized file path to the DOI parsed from its name;
  # doc_key falls back to the basename when no DOI could be parsed.
  file_map <- tibble(
    file = normalizePath(tei_files, winslash = "/", mustWork = FALSE),
    doi = map_chr(tei_files, extract_doi_from_filename)
  ) |>
    mutate(doc_key = coalesce(doi, basename(file)))
  # NOTE(review): this join assumes collect_full_text() normalizes the
  # same paths identically — confirm if inputs can be relative/symlinked.
  full_text <- full_text |>
    left_join(file_map |> select(file, doi, doc_key), by = "file")
  keywords <- c("pilot", "feasibility study", "preliminary study")
  # Case-insensitive substring match, so "pilot" also hits "pilots".
  keyword_re <- regex(str_c(keywords, collapse = "|"), ignore_case = TRUE)
  hits <- full_text |>
    # Drop the references section but keep rows with unknown (NA) section
    # (NA != "references" is NA, so the is.na() branch rescues them).
    filter(section != "references" | is.na(section)) |>
    filter(str_detect(text, keyword_re)) |>
    transmute(
      doc_key,
      doi,
      section = coalesce(section, "unknown"),
      # All matches per text block, lower-cased so variants collapse.
      keyword = str_extract_all(str_to_lower(text), keyword_re)
    ) |>
    unnest_longer(keyword, keep_empty = FALSE)
  # One row per document that had at least one hit, with the distinct
  # keywords and sections concatenated for the CSV.
  per_doc_hits <- hits |>
    group_by(doc_key, doi) |>
    summarise(
      keywords_found = str_c(sort(unique(keyword)), collapse = "; "),
      sections_found = str_c(sort(unique(section)), collapse = "; "),
      include = TRUE,
      .groups = "drop"
    )
  # Screening decision per document: include is TRUE iff any hit;
  # documents without hits get empty strings and include = FALSE.
  result <- file_map |>
    select(doc_key, doi) |>
    distinct() |>
    left_join(per_doc_hits, by = c("doc_key", "doi")) |>
    mutate(
      keywords_found = coalesce(keywords_found, ""),
      sections_found = coalesce(sections_found, ""),
      include = coalesce(include, FALSE)
    ) |>
    select(doi, keywords_found, sections_found, include) |>
    arrange(desc(include), doi)
  # Aggregate tables for the Markdown report: mentions = raw match
  # count, documents = distinct documents containing the match.
  keyword_summary <- hits |>
    group_by(keyword) |>
    summarise(
      mentions = n(),
      documents = n_distinct(doc_key),
      .groups = "drop"
    ) |>
    arrange(desc(documents), desc(mentions), keyword)
  section_summary <- hits |>
    group_by(section) |>
    summarise(
      mentions = n(),
      documents = n_distinct(doc_key),
      .groups = "drop"
    ) |>
    arrange(desc(documents), desc(mentions), section)
  keyword_section_summary <- hits |>
    distinct(doc_key, keyword, section) |>
    count(keyword, section, name = "documents") |>
    arrange(desc(documents), keyword, section)
  unresolved_doi <- sum(is.na(file_map$doi))
  utils::write.csv(result, args$out_csv, row.names = FALSE, na = "")
  build_markdown_report(
    out_md = args$out_md,
    tei_dir = args$tei_dir,
    result = result,
    keyword_summary = keyword_summary,
    section_summary = section_summary,
    keyword_section_summary = keyword_section_summary,
    unresolved_doi = unresolved_doi
  )
  message("Wrote CSV: ", args$out_csv)
  message("Wrote report: ", args$out_md)
}
# Run the pipeline when this file is executed as a script.
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment