Skip to content

Instantly share code, notes, and snippets.

@SwampThingPaul
Created November 12, 2025 14:49
Show Gist options
  • Select an option

  • Save SwampThingPaul/4365e5b1ce9e8fb17ff8c1f30f3e91dc to your computer and use it in GitHub Desktop.

Select an option

Save SwampThingPaul/4365e5b1ce9e8fb17ff8c1f30f3e91dc to your computer and use it in GitHub Desktop.
Sampling inventory interrogation
## Function
# Detect sampling frequency for one vector of dates (Date or character)
detect_sampling_freq_base <- function(dates, pct_threshold = 0.6, day_tol = 2) {
dates <- sort(as.Date(dates))
if (length(dates) < 2) return(list(label = "insufficient", median_diff = NA, mode_diff = NA, pct = NA))
diffs_days <- as.numeric(diff(dates)) # consecutive differences in days
# month differences (integer months between consecutive dates)
month_index <- function(d) as.integer(format(d, "%Y"))*12 + as.integer(format(d, "%m"))
diffs_months <- diff(month_index(dates))
median_diff <- median(diffs_days, na.rm = TRUE)
# mode of diffs (days)
mode_diff <- as.numeric(names(sort(table(diffs_days), decreasing = TRUE)[1]))
# candidate checks
is_weekly <- abs(diffs_days - 7) <= day_tol
is_fortnight <- abs(diffs_days - 14) <= day_tol
is_month_days <- abs(diffs_days - 30) <= 3 | diffs_months == 1
is_quarter_months <- diffs_months == 3 | abs(diffs_days - 91) <= 7
is_annual_months <- diffs_months == 12 | abs(diffs_days - 365) <= 15
# compute proportions
p_weekly <- mean(is_weekly, na.rm = TRUE)
p_fort <- mean(is_fortnight, na.rm = TRUE)
p_month <- mean(is_month_days, na.rm = TRUE)
p_quarter <- mean(is_quarter_months, na.rm = TRUE)
p_annual <- mean(is_annual_months, na.rm = TRUE)
props <- c(weekly = p_weekly, fortnight = p_fort, monthly = p_month, quarterly = p_quarter, annual = p_annual)
best <- names(which.max(props))
best_pct <- max(props)
label <- if (best_pct >= pct_threshold) best else "mixed/irregular"
list(
label = label,
median_diff_days = median_diff,
mode_diff_days = mode_diff,
best_pct = best_pct,
props = props,
n_intervals = length(diffs_days)
)
}
# -------------------------------------------------------------------------
library(ggplot2)
library(viridisLite)
# Example data.frame
some_data <- data.frame(
station = ...,
network = ...,
Param = ...,
Samp.Date = ...,
WY = ...,
value = ...
)
## data inventory
inv_df <- aggregate(value ~ station + network + Param + WY,some_data,FUN = length)
names(inv_df)[5] <- "n_samples"
data_inv <- ggplot(inv_df, aes(x = WY, y = station, fill = n_samples)) +
geom_tile(color = "grey90") +
scale_fill_viridis_c(option = "C", trans = "sqrt") +
labs(
title = paste("Data Inventory (grab sample only)"),
x = "Water Year",
y = "Site",
fill = "# Samples"
)+
facet_grid(network~Param,scales = "free_y",space = "free_y")
## sampling internval
groups <- split(df, paste(some_data$station, some_data$Param, sep = "||"))
results <- lapply(names(groups), function(k) {
sub <- groups[[k]]
parts <- strsplit(k, "\\|\\|")[[1]]
res <- detect_sampling_freq_base(sub$Samp.Date)
data.frame(
station = parts[1],
parameter = parts[2],
freq_label = res$label,
median_diff_days = res$median_diff_days,
mode_diff_days = res$mode_diff_days,
best_pct = res$best_pct,
n_intervals = res$n_intervals,
stringsAsFactors = FALSE
)
})
freq_df <- do.call(rbind, results)
freq_df <- merge(freq_df,sites_locs,"station")
freq_df$freq_label <- factor(freq_df$freq_label,
levels = c("mixed/irregular","quarterly",
"monthly","fortnight", "weekly"))
bar_freq_netparam <- ggplot(freq_df, aes(x = freq_label, fill = freq_label)) +
geom_bar() +labs(
title = "Sampling Frequency by Parameter",
x = "Sampling Frequency",
y = "Number of Stations",
fill = "Inferred Frequency"
) + ylim(0,50) +
facet_grid(network~parameter,scales = "free_y",space = "free_y")+
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
)
## Sampling Cadence change
# Sort data by station and date
some_data <- some_data[order(some_data$station, some_data$Samp.Date), ]
# Split by station and year
spl <- split(some_data, list(some_data$station, some_data$Param, some_data$WY), drop = TRUE, sep = "_|_")
# Compute median sampling interval for each group
median_intervals <- sapply(spl, function(x) {
if (nrow(x) > 1) median(diff(x$Samp.Date)) else NA
})
# Split names into separate columns
split_names <- do.call(rbind, strsplit(names(median_intervals), "_\\|_"))
freq_df2 <- data.frame(
station = split_names[, 1],
Param = split_names[, 2],
WY = split_names[, 3],
median_diff_days = median_intervals
)
freq_df2 <- freq_df2[!is.na(freq_df2$median_diff_days), ]
freq_df2 <- freq_df2[order(freq_df2$station, freq_df2$Param, freq_df2$WY), ]
freq_df2$change_flag <- ave(
freq_df2$median_diff_days,
interaction(freq_df2$station, freq_df2$Param),
FUN = function(x) c(NA, abs(diff(x)) > 5)
)
freq_df2$change_flag <- factor(
ifelse(is.na(freq_df2$change_flag), "No data",
ifelse(freq_df2$change_flag == 1, "Change", "Stable")),
levels = c("No data", "Stable", "Change")
)
freq_df2$cadence_group <- cut(
freq_df2$median_diff_days,
breaks = c(-Inf, 7, 14, 30, 90, Inf),
labels = c("<7", "7–14", "14–30", "30–90", ">90"),
include.lowest = TRUE
)
samp_interval <- ggplot(freq_df2, aes(x = WY, y = station, fill = cadence_group)) +
geom_tile(color = "grey80") +
facet_grid(network~ Param,scales = "free_y",space = "free_y") +
scale_fill_viridis_d(option = "C", direction = -1, name = "Sampling Frequency") +
scale_x_discrete(
breaks = seq(1970, max(freq_df2$WY), by = 10) # every 5 years
) +
labs(
title = "Sampling Interval by Station and Parameter",
x = "Water Year",
y = "Station"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment