Skip to content

Instantly share code, notes, and snippets.

@LukasWallrich
Created February 28, 2026 11:50
Show Gist options
  • Select an option

  • Save LukasWallrich/256753d59de4c74ae1cd8d808e744082 to your computer and use it in GitHub Desktop.

Select an option

Save LukasWallrich/256753d59de4c74ae1cd8d808e744082 to your computer and use it in GitHub Desktop.
PDF conversion action and pilot keyword screening script (public)
name: Continuous GROBID Conversion (Authenticated)

on:
  workflow_dispatch: {}
  # schedule:
  #   - cron: "0 */6 * * *"

permissions:
  contents: write

jobs:
  grobid-service-account:
    runs-on: ubuntu-latest
    steps:
      # The GROBID full image is large; reclaim space first so the
      # docker pull does not fail on the default runner disk.
      - name: Free disk space
        run: |
          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc \
            /opt/hostedtoolcache/CodeQL /usr/local/share/boost \
            "$AGENT_TOOLSDIRECTORY"
          docker rmi $(docker images -aq) 2>/dev/null || true

      - name: Start GROBID
        run: |
          docker run -d --name grobid -p 8070:8070 grobid/grobid:0.8.2.1-full
          # Poll the health endpoint for up to 10 minutes while the model loads.
          timeout 600s bash -c '
          until curl -s localhost:8070/api/isalive > /dev/null; do
            echo "Waiting for GROBID..."
            sleep 5
          done'

      - name: Checkout repo
        uses: actions/checkout@v4

      - name: Setup R with r2u
        uses: eddelbuettel/github-actions/r2u-setup@master

      - name: Install R Packages
        # NOTE: "tools" ships with base R and is not a CRAN package,
        # so it must not be passed to install.packages().
        run: Rscript -e 'install.packages(c("googledrive", "jsonlite", "httr"))'

      # Write the secret to a file for R to use.
      - name: Create Service Account File
        run: echo '${{ secrets.GDRIVE_SA_JSON }}' > service_account.json

      - name: Ensure directories exist
        run: mkdir -p tei

      - name: Run Processing Loop (Authenticated)
        shell: Rscript {0}
        run: |
          library(googledrive)
          library(jsonlite)
          library(httr)
          library(tools)

          # --- CONFIGURATION ---
          FOLDER_ID <- "18vChmJn4Q2biMmTbRWWgwkQd8KE002JR"
          GROBID_URL <- "http://localhost:8070/api/processFulltextDocument"
          JSON_FILE <- "processed.json"
          # We can be more aggressive with batch size now that we are authenticated
          BATCH_SIZE <- 50
          # Stop after 5 hours to stay under the 6-hour job limit.
          TIME_LIMIT <- 5 * 60 * 60
          MAX_RETRIES <- 3

          system('git config user.name "github-actions"')
          system('git config user.email "actions@github.com"')

          # --- AUTHENTICATION ---
          # R reads the file we created in the previous YAML step.
          tryCatch({
            drive_auth(path = "service_account.json")
            message("Authentication successful.")
          }, error = function(e) {
            message("CRITICAL: Authentication failed.")
            quit(status = 1)
          })

          # --- HELPER: SANITIZE FILENAME ---
          # Replace unsafe characters; if the sanitized name already
          # exists, append the Drive file id to avoid overwriting.
          clean_filename <- function(name, id) {
            base <- file_path_sans_ext(name)
            safe_base <- gsub("[^[:alnum:]\\.-]", "_", base)
            candidate <- paste0("tei/", safe_base, ".tei.xml")
            if (file.exists(candidate)) {
              return(paste0("tei/", safe_base, "_", id, ".tei.xml"))
            }
            return(candidate)
          }

          # --- INITIALIZATION ---
          start_time <- Sys.time()
          if (file.exists(JSON_FILE)) {
            processed_ids <- fromJSON(JSON_FILE)
          } else {
            processed_ids <- character(0)
          }
          message("Fetching file list...")
          all_files <- drive_ls(as_id(FOLDER_ID), pattern = "\\.pdf$")
          todo <- all_files[!(all_files$id %in% processed_ids), ]
          total_todo <- nrow(todo)
          if (total_todo == 0) {
            message("All caught up!")
            quit(status = 0)
          }
          message(paste("Queue size:", total_todo))

          # --- LOOP ---
          current_idx <- 1
          total_processed_session <- 0
          while (current_idx <= total_todo) {
            if (as.numeric(difftime(Sys.time(), start_time, units = "secs")) > TIME_LIMIT) break
            end_idx <- min(current_idx + BATCH_SIZE - 1, total_todo)
            batch_files <- todo[current_idx:end_idx, ]
            message(paste0("\n--- Starting Batch: ", current_idx, " to ", end_idx, " ---"))
            batch_processed_ids <- c()
            for (i in seq_len(nrow(batch_files))) {
              f_id <- batch_files$id[i]
              f_name <- batch_files$name[i]
              local_pdf <- "input.pdf"
              final_output_name <- clean_filename(f_name, f_id)
              download_success <- FALSE
              # --- DOWNLOAD (retry with linear backoff) ---
              for (attempt in 1:MAX_RETRIES) {
                if (file.exists(local_pdf)) file.remove(local_pdf)
                tryCatch({
                  drive_download(as_id(f_id), path = local_pdf, overwrite = TRUE, verbose = FALSE)
                  if (file.exists(local_pdf) && file.info(local_pdf)$size > 0) {
                    download_success <- TRUE
                  }
                }, error = function(e) {
                  if (attempt < MAX_RETRIES) Sys.sleep(2 * attempt)
                })
                if (download_success) break
              }
              # --- PROCESS via GROBID ---
              if (download_success) {
                tryCatch({
                  resp <- POST(
                    url = GROBID_URL,
                    body = list(input = upload_file(local_pdf)),
                    encode = "multipart"
                  )
                  if (status_code(resp) == 200) {
                    xml_content <- content(resp, "text", encoding = "UTF-8")
                    # Very short responses are empty TEI shells, not real output.
                    if (nchar(xml_content) > 100) {
                      writeLines(xml_content, final_output_name)
                      batch_processed_ids <- c(batch_processed_ids, f_id)
                      cat(".")
                    } else {
                      message(paste("\nEmpty Result:", f_name))
                    }
                  } else {
                    message(paste("\nGROBID Fail:", f_name))
                  }
                }, error = function(e) {
                  message(paste("\nScript Error:", f_name))
                })
              } else {
                message(paste("\nDownload Fail:", f_name))
              }
              if (file.exists(local_pdf)) file.remove(local_pdf)
            }
            # --- COMMIT progress after each batch so partial work survives ---
            if (length(batch_processed_ids) > 0) {
              current_processed <- if (file.exists(JSON_FILE)) fromJSON(JSON_FILE) else character(0)
              updated_processed <- unique(c(current_processed, batch_processed_ids))
              write_json(updated_processed, JSON_FILE)
              system("git add tei/ processed.json")
              system(paste0("git commit -m 'Processed batch: ", length(batch_processed_ids), " files'"))
              system("git push")
              total_processed_session <- total_processed_session + length(batch_processed_ids)
            }
            current_idx <- end_idx + 1
          }

      # Cleanup the secret file even if earlier steps failed.
      - name: Cleanup Keys
        if: always()
        run: rm -f service_account.json
#!/usr/bin/env Rscript
suppressPackageStartupMessages({
library(dplyr)
library(purrr)
library(stringr)
library(tibble)
library(tidyr)
library(metacheck)
})
# Null-coalescing helper: fall back to `y` when `x` is NULL, has
# length zero, or consists entirely of NA values; otherwise return `x`.
`%||%` <- function(x, y) {
  missing_ish <- is.null(x) || length(x) == 0 || all(is.na(x))
  if (missing_ish) {
    return(y)
  }
  x
}
# Parse `--key=value` command-line flags into the script's option list.
#
# Recognized keys: tei-dir, out-csv, out-md, max-files. Arguments that
# do not start with "--", unknown keys, and flags without a value are
# silently ignored; defaults are returned for anything not overridden.
parse_args <- function(args) {
  opts <- list(
    tei_dir = "tei",
    out_csv = "pilot_keyword_screening.csv",
    out_md = "pilot_keyword_screening_report.md",
    max_files = NA_integer_
  )
  if (length(args) == 0) {
    return(opts)
  }
  for (raw in args) {
    if (!str_detect(raw, "^--")) next
    # Split at the first "=" only; everything after it is the value.
    pieces <- str_split_fixed(sub("^--", "", raw), "=", 2)
    key <- pieces[, 1]
    val <- pieces[, 2]
    if (val == "") next
    switch(key,
      "tei-dir" = opts$tei_dir <- val,
      "out-csv" = opts$out_csv <- val,
      "out-md" = opts$out_md <- val,
      "max-files" = opts$max_files <- as.integer(val)
    )
  }
  opts
}
# Return full paths of all GROBID TEI outputs (files ending in
# ".tei.xml") found directly inside `dir`.
list_tei_files <- function(dir = "tei") {
  tei_pattern <- "\\.tei\\.xml$"
  dir(path = dir, pattern = tei_pattern, full.names = TRUE)
}
# Recover a DOI from a TEI filename.
#
# The filename stem is expected to encode the DOI before an "_-_"
# separator, with the "/" after the "10.NNNN" registrant prefix
# replaced by "_" or "-" (e.g. "10.1234_abcd_-_Title.tei.xml" ->
# "10.1234/abcd"). Returns NA_character_ when no DOI-shaped token is
# found.
extract_doi_from_filename <- function(path) {
  stem <- basename(path)
  stem <- str_remove(stem, "\\.tei\\.xml$")
  stem <- str_remove(stem, "\\.xml$")
  stem <- utils::URLdecode(stem)
  candidate <- str_split_fixed(str_to_lower(stem), "_-_", 2)[, 1]
  # Restore the "/" encoded as "_" or "-" right after the registrant
  # prefix (single character class covers both original replacements).
  candidate <- str_replace(candidate, "^(10\\.[0-9]{4,9})[_-]", "\\1/")
  doi <- str_extract(candidate, "10\\.[0-9]{4,9}/[-._;()/:a-z0-9]+")
  # Strip trailing punctuation that the filename may have picked up.
  doi <- str_replace(doi %||% "", "[\\.,;:]+$", "")
  if (nchar(doi) == 0) NA_character_ else doi
}
# Flatten the full-text tables of one or more metacheck papers into a
# single data frame, tagging every row with a normalized source path.
#
# papers: a scivrs_paperlist or a single paper object.
# Returns one combined data frame; papers whose full_text table is
# missing or empty contribute no rows.
collect_full_text <- function(papers) {
  paper_list <- if (inherits(papers, "scivrs_paperlist")) {
    unname(papers)
  } else {
    list(papers)
  }
  map_dfr(paper_list, function(p) {
    ft <- p$full_text
    # Guard against papers with no full text at all: `ft` may be NULL
    # rather than a zero-row table, and nrow(NULL) would make the `if`
    # condition error out.
    if (is.null(ft) || nrow(ft) == 0) {
      return(tibble())
    }
    # Join key used later against the per-file metadata; fall back to
    # the paper id when no filename was recorded.
    ft$file <- normalizePath(p$info$filename %||% p$id, winslash = "/", mustWork = FALSE)
    ft
  })
}
# Write a Markdown summary of the keyword screening run to `out_md`.
#
# out_md: output file path.
# tei_dir: input directory name (reported verbatim in the header).
# result: per-document data frame with at least a logical `include`
#   column (one row per document).
# keyword_summary / section_summary / keyword_section_summary: data
#   frames rendered as Markdown tables; a zero-row frame renders as a
#   "_No hits found._" placeholder.
# unresolved_doi: count of files whose DOI could not be parsed.
build_markdown_report <- function(
  out_md,
  tei_dir,
  result,
  keyword_summary,
  section_summary,
  keyword_section_summary,
  unresolved_doi
) {
  total <- nrow(result)
  included <- sum(result$include)
  excluded <- total - included
  # Render a data frame as a pipe-delimited Markdown table.
  fmt_table <- function(df) {
    if (nrow(df) == 0) {
      return(c("_No hits found._", ""))
    }
    header <- paste0("| ", paste(names(df), collapse = " | "), " |")
    sep <- paste0("|", paste(rep("---", ncol(df)), collapse = "|"), "|")
    # Convert column-wise: apply(df, 1, ...) would coerce the whole
    # data frame to a character matrix via format(), space-padding
    # numeric cells and polluting the Markdown output.
    col_text <- lapply(df, as.character)
    rows <- vapply(
      seq_len(nrow(df)),
      function(i) {
        cells <- vapply(col_text, `[[`, character(1), i)
        paste0("| ", paste(cells, collapse = " | "), " |")
      },
      character(1)
    )
    c(header, sep, rows, "")
  }
  lines <- c(
    "# Pilot Keyword Screening Report",
    "",
    paste0("Generated: ", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z")),
    paste0("Input directory: `", tei_dir, "`"),
    "",
    "## Summary",
    "",
    paste0("- Total documents: ", total),
    paste0("- Included (>=1 hit): ", included),
    paste0("- Excluded (0 hits): ", excluded),
    paste0("- Missing DOI parsed from filename: ", unresolved_doi),
    "",
    "## Hits Per Keyword",
    ""
  )
  lines <- c(lines, fmt_table(keyword_summary), "## Hits Per Section", "")
  lines <- c(lines, fmt_table(section_summary), "## Hits Per Keyword And Section", "")
  lines <- c(lines, fmt_table(keyword_section_summary))
  writeLines(lines, con = out_md)
}
# Entry point: read GROBID TEI output, screen each document's full text
# for pilot/feasibility keywords, and write a per-document CSV plus a
# Markdown summary report. Paths and limits come from --key=value CLI
# flags (see parse_args).
main <- function() {
  args <- parse_args(commandArgs(trailingOnly = TRUE))
  tei_files <- list_tei_files(args$tei_dir)
  # Optional cap for quick pilot runs; sorted first so the subset of
  # files processed is deterministic across runs.
  if (!is.na(args$max_files) && args$max_files > 0) {
    tei_files <- head(sort(tei_files), args$max_files)
  }
  if (length(tei_files) == 0) {
    stop("No .tei.xml files found in: ", args$tei_dir)
  }
  message("Reading TEI files with metacheck: ", length(tei_files), " files")
  papers <- metacheck::read(tei_files)
  full_text <- collect_full_text(papers)
  # Map each normalized file path to the DOI parsed from its name;
  # doc_key falls back to the basename when no DOI could be parsed.
  file_map <- tibble(
    file = normalizePath(tei_files, winslash = "/", mustWork = FALSE),
    doi = map_chr(tei_files, extract_doi_from_filename)
  ) |>
    mutate(doc_key = coalesce(doi, basename(file)))
  # NOTE(review): this join assumes collect_full_text() normalizes the
  # same paths identically — confirm if inputs can be relative/symlinked.
  full_text <- full_text |>
    left_join(file_map |> select(file, doi, doc_key), by = "file")
  keywords <- c("pilot", "feasibility study", "preliminary study")
  # Case-insensitive substring match, so "pilot" also hits "pilots".
  keyword_re <- regex(str_c(keywords, collapse = "|"), ignore_case = TRUE)
  hits <- full_text |>
    # Drop the references section but keep rows with unknown (NA) section
    # (NA != "references" is NA, so the is.na() branch rescues them).
    filter(section != "references" | is.na(section)) |>
    filter(str_detect(text, keyword_re)) |>
    transmute(
      doc_key,
      doi,
      section = coalesce(section, "unknown"),
      # All matches per text block, lower-cased so variants collapse.
      keyword = str_extract_all(str_to_lower(text), keyword_re)
    ) |>
    unnest_longer(keyword, keep_empty = FALSE)
  # One row per document that had at least one hit, with the distinct
  # keywords and sections concatenated for the CSV.
  per_doc_hits <- hits |>
    group_by(doc_key, doi) |>
    summarise(
      keywords_found = str_c(sort(unique(keyword)), collapse = "; "),
      sections_found = str_c(sort(unique(section)), collapse = "; "),
      include = TRUE,
      .groups = "drop"
    )
  # Screening decision per document: include is TRUE iff any hit;
  # documents without hits get empty strings and include = FALSE.
  result <- file_map |>
    select(doc_key, doi) |>
    distinct() |>
    left_join(per_doc_hits, by = c("doc_key", "doi")) |>
    mutate(
      keywords_found = coalesce(keywords_found, ""),
      sections_found = coalesce(sections_found, ""),
      include = coalesce(include, FALSE)
    ) |>
    select(doi, keywords_found, sections_found, include) |>
    arrange(desc(include), doi)
  # Aggregate tables for the Markdown report: mentions = raw match
  # count, documents = distinct documents containing the match.
  keyword_summary <- hits |>
    group_by(keyword) |>
    summarise(
      mentions = n(),
      documents = n_distinct(doc_key),
      .groups = "drop"
    ) |>
    arrange(desc(documents), desc(mentions), keyword)
  section_summary <- hits |>
    group_by(section) |>
    summarise(
      mentions = n(),
      documents = n_distinct(doc_key),
      .groups = "drop"
    ) |>
    arrange(desc(documents), desc(mentions), section)
  keyword_section_summary <- hits |>
    distinct(doc_key, keyword, section) |>
    count(keyword, section, name = "documents") |>
    arrange(desc(documents), keyword, section)
  unresolved_doi <- sum(is.na(file_map$doi))
  utils::write.csv(result, args$out_csv, row.names = FALSE, na = "")
  build_markdown_report(
    out_md = args$out_md,
    tei_dir = args$tei_dir,
    result = result,
    keyword_summary = keyword_summary,
    section_summary = section_summary,
    keyword_section_summary = keyword_section_summary,
    unresolved_doi = unresolved_doi
  )
  message("Wrote CSV: ", args$out_csv)
  message("Wrote report: ", args$out_md)
}
# Run the pipeline when this file is executed as a script.
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment