Created
February 28, 2026 11:50
-
-
Save LukasWallrich/256753d59de4c74ae1cd8d808e744082 to your computer and use it in GitHub Desktop.
PDF conversion action and pilot keyword screening script (public)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow: continuously convert PDFs stored in a Google Drive folder to
# TEI XML with a local GROBID container, committing results back to the
# repository in batches so a timed-out run can resume where it left off
# (progress state lives in processed.json).
name: Continuous GROBID Conversion (Authenticated)

on:
  workflow_dispatch: {}
  # Uncomment to run automatically every six hours.
  #schedule:
  #  - cron: "0 */6 * * *"

# Required so the job can push converted files back to the repository.
permissions:
  contents: write

jobs:
  grobid-service-account:
    runs-on: ubuntu-latest
    steps:
      # The full GROBID image is large; reclaim space on the hosted runner.
      - name: Free disk space
        run: |
          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc \
            /opt/hostedtoolcache/CodeQL /usr/local/share/boost \
            "$AGENT_TOOLSDIRECTORY"
          docker rmi $(docker images -aq) 2>/dev/null || true

      - name: Start GROBID
        run: |
          docker run -d --name grobid -p 8070:8070 grobid/grobid:0.8.2.1-full
          # Poll the health endpoint until the service answers (max 10 min).
          timeout 600s bash -c '
          until curl -s localhost:8070/api/isalive > /dev/null; do
            echo "Waiting for GROBID..."
            sleep 5
          done'

      - name: Checkout repo
        uses: actions/checkout@v4

      - name: Setup R with r2u
        uses: eddelbuettel/github-actions/r2u-setup@master

      - name: Install R Packages
        # NOTE: "tools" is a base package shipped with every R install;
        # requesting it from install.packages() fails, so it is omitted here
        # (the script still attaches it with library(tools) below).
        run: Rscript -e 'install.packages(c("googledrive", "jsonlite", "httr"))'

      # Write the secret to a file for R to use. Passing it through an env
      # var — instead of interpolating ${{ ... }} into the shell command —
      # survives embedded quotes and avoids command-injection via the
      # secret's contents.
      - name: Create Service Account File
        env:
          GDRIVE_SA_JSON: ${{ secrets.GDRIVE_SA_JSON }}
        run: printf '%s' "$GDRIVE_SA_JSON" > service_account.json

      - name: Ensure directories exist
        run: mkdir -p tei

      - name: Run Processing Loop (Authenticated)
        shell: Rscript {0}
        run: |
          library(googledrive)
          library(jsonlite)
          library(httr)
          library(tools)

          # --- CONFIGURATION ---
          FOLDER_ID <- "18vChmJn4Q2biMmTbRWWgwkQd8KE002JR"
          GROBID_URL <- "http://localhost:8070/api/processFulltextDocument"
          JSON_FILE <- "processed.json"
          # We can be more aggressive with batch size now that we are authenticated
          BATCH_SIZE <- 50
          TIME_LIMIT <- 5 * 60 * 60  # seconds; stop well before the 6 h job limit
          MAX_RETRIES <- 3
          # Silence per-file download chatter; replaces the deprecated
          # `verbose` argument of drive_download().
          options(googledrive_quiet = TRUE)

          system('git config user.name "github-actions"')
          system('git config user.email "actions@github.com"')

          # --- AUTHENTICATION ---
          # R reads the service-account file created in the previous YAML step.
          tryCatch({
            drive_auth(path = "service_account.json")
            message("Authentication successful.")
          }, error = function(e) {
            message("CRITICAL: Authentication failed.")
            quit(status = 1)
          })

          # --- HELPER: SANITIZE FILENAME ---
          # Map a Drive file name to a safe tei/ output path. On a name
          # collision the Drive id is appended so distinct PDFs with the
          # same title never overwrite each other.
          clean_filename <- function(name, id) {
            base <- file_path_sans_ext(name)
            safe_base <- gsub("[^[:alnum:]\\.-]", "_", base)
            candidate <- paste0("tei/", safe_base, ".tei.xml")
            if (file.exists(candidate)) {
              return(paste0("tei/", safe_base, "_", id, ".tei.xml"))
            }
            return(candidate)
          }

          # --- INITIALIZATION ---
          start_time <- Sys.time()
          if (file.exists(JSON_FILE)) {
            processed_ids <- fromJSON(JSON_FILE)
          } else {
            processed_ids <- character(0)
          }
          message("Fetching file list...")
          all_files <- drive_ls(as_id(FOLDER_ID), pattern = "\\.pdf$")
          todo <- all_files[!(all_files$id %in% processed_ids), ]
          total_todo <- nrow(todo)
          if (total_todo == 0) {
            message("All caught up!")
            quit(status = 0)
          }
          message(paste("Queue size:", total_todo))

          # --- LOOP ---
          current_idx <- 1
          total_processed_session <- 0
          while (current_idx <= total_todo) {
            # Stop cleanly once the time budget is spent; progress is already
            # committed batch by batch, so the next run resumes from there.
            if (as.numeric(difftime(Sys.time(), start_time, units = "secs")) > TIME_LIMIT) break
            end_idx <- min(current_idx + BATCH_SIZE - 1, total_todo)
            batch_files <- todo[current_idx:end_idx, ]
            message(paste0("\n--- Starting Batch: ", current_idx, " to ", end_idx, " ---"))
            batch_processed_ids <- character(0)
            for (i in seq_len(nrow(batch_files))) {
              f_id <- batch_files$id[i]
              f_name <- batch_files$name[i]
              local_pdf <- "input.pdf"
              final_output_name <- clean_filename(f_name, f_id)
              download_success <- FALSE
              # --- DOWNLOAD (retry with growing backoff) ---
              for (attempt in seq_len(MAX_RETRIES)) {
                if (file.exists(local_pdf)) file.remove(local_pdf)
                tryCatch({
                  drive_download(as_id(f_id), path = local_pdf, overwrite = TRUE)
                  # A zero-byte file counts as a failed download.
                  if (file.exists(local_pdf) && file.info(local_pdf)$size > 0) {
                    download_success <- TRUE
                    break
                  }
                }, error = function(e) {
                  if (attempt < MAX_RETRIES) Sys.sleep(2 * attempt)
                })
              }
              # --- PROCESS via GROBID ---
              if (download_success) {
                tryCatch({
                  resp <- POST(
                    url = GROBID_URL,
                    body = list(input = upload_file(local_pdf)),
                    encode = "multipart"
                  )
                  if (status_code(resp) == 200) {
                    xml_content <- content(resp, "text", encoding = "UTF-8")
                    # Tiny responses are empty TEI shells; treat as failure
                    # and leave the file unmarked so it is retried later.
                    if (nchar(xml_content) > 100) {
                      writeLines(xml_content, final_output_name)
                      batch_processed_ids <- c(batch_processed_ids, f_id)
                      cat(".")
                    } else {
                      message(paste("\nEmpty Result:", f_name))
                    }
                  } else {
                    message(paste("\nGROBID Fail:", f_name))
                  }
                }, error = function(e) {
                  message(paste("\nScript Error:", f_name))
                })
              } else {
                message(paste("\nDownload Fail:", f_name))
              }
              if (file.exists(local_pdf)) file.remove(local_pdf)
            }
            # --- COMMIT after each batch so a timeout loses little work ---
            if (length(batch_processed_ids) > 0) {
              current_processed <- if (file.exists(JSON_FILE)) fromJSON(JSON_FILE) else character(0)
              updated_processed <- unique(c(current_processed, batch_processed_ids))
              write_json(updated_processed, JSON_FILE)
              system("git add tei/ processed.json")
              system(paste0("git commit -m 'Processed batch: ", length(batch_processed_ids), " files'"))
              system("git push")
              total_processed_session <- total_processed_session + length(batch_processed_ids)
            }
            current_idx <- end_idx + 1
          }

      # Remove the credential file even when earlier steps fail.
      - name: Cleanup Keys
        if: always()
        run: rm -f service_account.json
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env Rscript | |
| suppressPackageStartupMessages({ | |
| library(dplyr) | |
| library(purrr) | |
| library(stringr) | |
| library(tibble) | |
| library(tidyr) | |
| library(metacheck) | |
| }) | |
| `%||%` <- function(x, y) if (is.null(x) || length(x) == 0 || all(is.na(x))) y else x | |
# Parse `--key=value` command-line flags into a named settings list.
#
# Recognized flags: --tei-dir, --out-csv, --out-md, --max-files.
# Anything that does not start with "--", any unknown key, and any flag
# without a value after "=" is silently ignored. Returns the defaults
# when `args` is empty.
parse_args <- function(args) {
  opts <- list(
    tei_dir = "tei",
    out_csv = "pilot_keyword_screening.csv",
    out_md = "pilot_keyword_screening_report.md",
    max_files = NA_integer_
  )
  if (length(args) == 0) {
    return(opts)
  }
  for (raw in args) {
    # Only long-form "--" flags are considered.
    if (!grepl("^--", raw)) next
    stripped <- sub("^--", "", raw)
    eq_pos <- regexpr("=", stripped, fixed = TRUE)
    # No "=" means no value was supplied: skip, same as an empty value.
    if (eq_pos < 0) next
    key <- substr(stripped, 1, eq_pos - 1)
    value <- substr(stripped, eq_pos + 1, nchar(stripped))
    if (value == "") next
    if (key == "tei-dir") opts$tei_dir <- value
    if (key == "out-csv") opts$out_csv <- value
    if (key == "out-md") opts$out_md <- value
    if (key == "max-files") opts$max_files <- as.integer(value)
  }
  opts
}
# Return the full paths of all GROBID TEI outputs (*.tei.xml) in `dir`.
list_tei_files <- function(dir = "tei") {
  tei_pattern <- "\\.tei\\.xml$"
  list.files(path = dir, pattern = tei_pattern, full.names = TRUE)
}
# Best-effort recovery of a DOI from a TEI file name.
#
# Expected naming convention: a (possibly URL-encoded) DOI, with '/' replaced
# by '-' or '_', optionally followed by "_-_<title...>". Returns the DOI as a
# lower-case string, or NA_character_ when no DOI-shaped prefix is found.
extract_doi_from_filename <- function(path) {
  # Strip the TEI/XML extensions, then undo percent-encoding
  # (e.g. "%2F" -> "/") left over from download tooling.
  stem <- basename(path) |>
    str_remove("\\.tei\\.xml$") |>
    str_remove("\\.xml$") |>
    utils::URLdecode()
  lower <- str_to_lower(stem)
  # Everything before the first "_-_" separator is the DOI candidate.
  prefix <- str_split_fixed(lower, "_-_", 2)[, 1]
  # Expected filename DOI encoding uses '-' or '_' where DOI has '/'.
  # Restore the slash after the registrant prefix (10.NNNN...).
  prefix <- str_replace(prefix, "^(10\\.[0-9]{4,9})_", "\\1/")
  prefix <- str_replace(prefix, "^(10\\.[0-9]{4,9})-", "\\1/")
  # Grab the DOI-shaped span; input is already lower-cased, so the a-z
  # character class is sufficient.
  doi <- str_extract(prefix, "10\\.[0-9]{4,9}/[-._;()/:a-z0-9]+")
  # `doi` is NA when no match; `%||%` turns that into "" so the trailing
  # punctuation strip and nchar() test below are safe.
  doi <- str_replace(doi %||% "", "[\\.,;:]+$", "")
  if (nchar(doi) == 0) {
    return(NA_character_)
  }
  doi
}
# Flatten the full-text tables of one or more parsed papers into a single
# data frame, tagging every row with a normalized source-file path so it can
# be joined back against the file map in main().
#
# `papers` is either a metacheck paper collection (class "scivrs_paperlist")
# or a single paper object. Papers with a missing or empty full_text table
# contribute no rows. NOTE(review): assumes each paper exposes `$full_text`
# (a data frame), `$info$filename`, and `$id` — confirm against metacheck's
# documented object structure.
collect_full_text <- function(papers) {
  paper_list <- if (inherits(papers, "scivrs_paperlist")) {
    unname(papers)
  } else {
    list(papers)
  }
  map_dfr(paper_list, function(p) {
    ft <- p$full_text
    # Guard against a missing table as well as an empty one: nrow(NULL)
    # returns NULL, and `if (NULL)` is an error in R.
    if (is.null(ft) || nrow(ft) == 0) {
      return(tibble())
    }
    # Prefer the recorded filename; fall back to the paper id. Normalize so
    # the path matches normalizePath() output used for file_map in main().
    ft$file <- normalizePath(p$info$filename %||% p$id, winslash = "/", mustWork = FALSE)
    ft
  })
}
# Write a markdown summary of the screening run to `out_md`.
#
# out_md                   output path for the report.
# tei_dir                  input directory, echoed into the header.
# result                   per-document data frame with a logical `include`.
# keyword_summary,
# section_summary,
# keyword_section_summary  hit-count data frames rendered as tables.
# unresolved_doi           count of files whose DOI could not be parsed.
build_markdown_report <- function(
  out_md,
  tei_dir,
  result,
  keyword_summary,
  section_summary,
  keyword_section_summary,
  unresolved_doi
) {
  total <- nrow(result)
  included <- sum(result$include)
  excluded <- total - included
  # Render a data frame as a GitHub-flavored markdown table (or a
  # placeholder line when it has no rows).
  fmt_table <- function(df) {
    if (nrow(df) == 0) {
      return(c("_No hits found._", ""))
    }
    header <- paste0("| ", paste(names(df), collapse = " | "), " |")
    sep <- paste0("|", paste(rep("---", ncol(df)), collapse = "|"), "|")
    # Convert column-wise instead of via apply(): apply() coerces the data
    # frame to a character matrix through format(), which left-pads numeric
    # columns to a common width and leaks stray spaces into table cells.
    cols <- lapply(df, as.character)
    rows <- vapply(
      seq_len(nrow(df)),
      function(i) {
        cells <- vapply(cols, `[[`, character(1), i)
        paste0("| ", paste(cells, collapse = " | "), " |")
      },
      character(1)
    )
    c(header, sep, rows, "")
  }
  lines <- c(
    "# Pilot Keyword Screening Report",
    "",
    paste0("Generated: ", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z")),
    paste0("Input directory: `", tei_dir, "`"),
    "",
    "## Summary",
    "",
    paste0("- Total documents: ", total),
    paste0("- Included (>=1 hit): ", included),
    paste0("- Excluded (0 hits): ", excluded),
    paste0("- Missing DOI parsed from filename: ", unresolved_doi),
    "",
    "## Hits Per Keyword",
    ""
  )
  lines <- c(lines, fmt_table(keyword_summary), "## Hits Per Section", "")
  lines <- c(lines, fmt_table(section_summary), "## Hits Per Keyword And Section", "")
  lines <- c(lines, fmt_table(keyword_section_summary))
  writeLines(lines, con = out_md)
}
# Entry point: screen TEI full texts for pilot/feasibility keywords, then
# write a per-document inclusion CSV and a markdown summary report.
main <- function() {
  args <- parse_args(commandArgs(trailingOnly = TRUE))
  tei_files <- list_tei_files(args$tei_dir)
  # Optional cap on input size; sort first so the subset is deterministic.
  if (!is.na(args$max_files) && args$max_files > 0) {
    tei_files <- head(sort(tei_files), args$max_files)
  }
  if (length(tei_files) == 0) {
    stop("No .tei.xml files found in: ", args$tei_dir)
  }
  message("Reading TEI files with metacheck: ", length(tei_files), " files")
  papers <- metacheck::read(tei_files)
  full_text <- collect_full_text(papers)
  # One row per input file: normalized path, parsed DOI, and a stable
  # document key that falls back to the bare filename when no DOI parses.
  file_map <- tibble(
    file = normalizePath(tei_files, winslash = "/", mustWork = FALSE),
    doi = map_chr(tei_files, extract_doi_from_filename)
  ) |>
    mutate(doc_key = coalesce(doi, basename(file)))
  # Attach the doi/doc_key to every full-text row via the normalized path.
  full_text <- full_text |>
    left_join(file_map |> select(file, doi, doc_key), by = "file")
  keywords <- c("pilot", "feasibility study", "preliminary study")
  # Case-insensitive substring alternation (so e.g. "pilots" also matches).
  keyword_re <- regex(str_c(keywords, collapse = "|"), ignore_case = TRUE)
  # One row per keyword occurrence, excluding the reference list; rows with
  # an NA section are kept and relabeled "unknown".
  hits <- full_text |>
    filter(section != "references" | is.na(section)) |>
    filter(str_detect(text, keyword_re)) |>
    transmute(
      doc_key,
      doi,
      section = coalesce(section, "unknown"),
      # Lower-case before extraction so matched keywords group together
      # regardless of the casing in the source text.
      keyword = str_extract_all(str_to_lower(text), keyword_re)
    ) |>
    unnest_longer(keyword, keep_empty = FALSE)
  # Collapse hits to one row per document with ";"-joined keyword and
  # section lists; these documents are flagged include = TRUE.
  per_doc_hits <- hits |>
    group_by(doc_key, doi) |>
    summarise(
      keywords_found = str_c(sort(unique(keyword)), collapse = "; "),
      sections_found = str_c(sort(unique(section)), collapse = "; "),
      include = TRUE,
      .groups = "drop"
    )
  # Every screened document appears in the result, hit or not; non-hits get
  # empty strings and include = FALSE.
  result <- file_map |>
    select(doc_key, doi) |>
    distinct() |>
    left_join(per_doc_hits, by = c("doc_key", "doi")) |>
    mutate(
      keywords_found = coalesce(keywords_found, ""),
      sections_found = coalesce(sections_found, ""),
      include = coalesce(include, FALSE)
    ) |>
    select(doi, keywords_found, sections_found, include) |>
    arrange(desc(include), doi)
  # Mentions = total occurrences; documents = distinct papers per keyword.
  keyword_summary <- hits |>
    group_by(keyword) |>
    summarise(
      mentions = n(),
      documents = n_distinct(doc_key),
      .groups = "drop"
    ) |>
    arrange(desc(documents), desc(mentions), keyword)
  section_summary <- hits |>
    group_by(section) |>
    summarise(
      mentions = n(),
      documents = n_distinct(doc_key),
      .groups = "drop"
    ) |>
    arrange(desc(documents), desc(mentions), section)
  # Document counts per keyword/section pair (each document counted once
  # per pair, via distinct()).
  keyword_section_summary <- hits |>
    distinct(doc_key, keyword, section) |>
    count(keyword, section, name = "documents") |>
    arrange(desc(documents), keyword, section)
  unresolved_doi <- sum(is.na(file_map$doi))
  utils::write.csv(result, args$out_csv, row.names = FALSE, na = "")
  build_markdown_report(
    out_md = args$out_md,
    tei_dir = args$tei_dir,
    result = result,
    keyword_summary = keyword_summary,
    section_summary = section_summary,
    keyword_section_summary = keyword_section_summary,
    unresolved_doi = unresolved_doi
  )
  message("Wrote CSV: ", args$out_csv)
  message("Wrote report: ", args$out_md)
}
main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment