Created
November 12, 2025 14:49
-
-
Save SwampThingPaul/4365e5b1ce9e8fb17ff8c1f30f3e91dc to your computer and use it in GitHub Desktop.
Sampling inventory interrogation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ## Function | |
| # Detect sampling frequency for one vector of dates (Date or character) | |
| detect_sampling_freq_base <- function(dates, pct_threshold = 0.6, day_tol = 2) { | |
| dates <- sort(as.Date(dates)) | |
| if (length(dates) < 2) return(list(label = "insufficient", median_diff = NA, mode_diff = NA, pct = NA)) | |
| diffs_days <- as.numeric(diff(dates)) # consecutive differences in days | |
| # month differences (integer months between consecutive dates) | |
| month_index <- function(d) as.integer(format(d, "%Y"))*12 + as.integer(format(d, "%m")) | |
| diffs_months <- diff(month_index(dates)) | |
| median_diff <- median(diffs_days, na.rm = TRUE) | |
| # mode of diffs (days) | |
| mode_diff <- as.numeric(names(sort(table(diffs_days), decreasing = TRUE)[1])) | |
| # candidate checks | |
| is_weekly <- abs(diffs_days - 7) <= day_tol | |
| is_fortnight <- abs(diffs_days - 14) <= day_tol | |
| is_month_days <- abs(diffs_days - 30) <= 3 | diffs_months == 1 | |
| is_quarter_months <- diffs_months == 3 | abs(diffs_days - 91) <= 7 | |
| is_annual_months <- diffs_months == 12 | abs(diffs_days - 365) <= 15 | |
| # compute proportions | |
| p_weekly <- mean(is_weekly, na.rm = TRUE) | |
| p_fort <- mean(is_fortnight, na.rm = TRUE) | |
| p_month <- mean(is_month_days, na.rm = TRUE) | |
| p_quarter <- mean(is_quarter_months, na.rm = TRUE) | |
| p_annual <- mean(is_annual_months, na.rm = TRUE) | |
| props <- c(weekly = p_weekly, fortnight = p_fort, monthly = p_month, quarterly = p_quarter, annual = p_annual) | |
| best <- names(which.max(props)) | |
| best_pct <- max(props) | |
| label <- if (best_pct >= pct_threshold) best else "mixed/irregular" | |
| list( | |
| label = label, | |
| median_diff_days = median_diff, | |
| mode_diff_days = mode_diff, | |
| best_pct = best_pct, | |
| props = props, | |
| n_intervals = length(diffs_days) | |
| ) | |
| } | |
| # ------------------------------------------------------------------------- | |
| library(ggplot2) | |
| library(viridisLite) | |
| # Example data.frame | |
| some_data <- data.frame( | |
| station = ..., | |
| network = ..., | |
| Param = ..., | |
| Samp.Date = ..., | |
| WY = ..., | |
| value = ... | |
| ) | |
| ## data inventory | |
| inv_df <- aggregate(value ~ station + network + Param + WY,some_data,FUN = length) | |
| names(inv_df)[5] <- "n_samples" | |
| data_inv <- ggplot(inv_df, aes(x = WY, y = station, fill = n_samples)) + | |
| geom_tile(color = "grey90") + | |
| scale_fill_viridis_c(option = "C", trans = "sqrt") + | |
| labs( | |
| title = paste("Data Inventory (grab sample only)"), | |
| x = "Water Year", | |
| y = "Site", | |
| fill = "# Samples" | |
| )+ | |
| facet_grid(network~Param,scales = "free_y",space = "free_y") | |
| ## sampling internval | |
| groups <- split(df, paste(some_data$station, some_data$Param, sep = "||")) | |
| results <- lapply(names(groups), function(k) { | |
| sub <- groups[[k]] | |
| parts <- strsplit(k, "\\|\\|")[[1]] | |
| res <- detect_sampling_freq_base(sub$Samp.Date) | |
| data.frame( | |
| station = parts[1], | |
| parameter = parts[2], | |
| freq_label = res$label, | |
| median_diff_days = res$median_diff_days, | |
| mode_diff_days = res$mode_diff_days, | |
| best_pct = res$best_pct, | |
| n_intervals = res$n_intervals, | |
| stringsAsFactors = FALSE | |
| ) | |
| }) | |
| freq_df <- do.call(rbind, results) | |
| freq_df <- merge(freq_df,sites_locs,"station") | |
| freq_df$freq_label <- factor(freq_df$freq_label, | |
| levels = c("mixed/irregular","quarterly", | |
| "monthly","fortnight", "weekly")) | |
| bar_freq_netparam <- ggplot(freq_df, aes(x = freq_label, fill = freq_label)) + | |
| geom_bar() +labs( | |
| title = "Sampling Frequency by Parameter", | |
| x = "Sampling Frequency", | |
| y = "Number of Stations", | |
| fill = "Inferred Frequency" | |
| ) + ylim(0,50) + | |
| facet_grid(network~parameter,scales = "free_y",space = "free_y")+ | |
| theme( | |
| axis.text.x = element_text(angle = 45, hjust = 1) | |
| ) | |
| ## Sampling Cadence change | |
| # Sort data by station and date | |
| some_data <- some_data[order(some_data$station, some_data$Samp.Date), ] | |
| # Split by station and year | |
| spl <- split(some_data, list(some_data$station, some_data$Param, some_data$WY), drop = TRUE, sep = "_|_") | |
| # Compute median sampling interval for each group | |
| median_intervals <- sapply(spl, function(x) { | |
| if (nrow(x) > 1) median(diff(x$Samp.Date)) else NA | |
| }) | |
| # Split names into separate columns | |
| split_names <- do.call(rbind, strsplit(names(median_intervals), "_\\|_")) | |
| freq_df2 <- data.frame( | |
| station = split_names[, 1], | |
| Param = split_names[, 2], | |
| WY = split_names[, 3], | |
| median_diff_days = median_intervals | |
| ) | |
| freq_df2 <- freq_df2[!is.na(freq_df2$median_diff_days), ] | |
| freq_df2 <- freq_df2[order(freq_df2$station, freq_df2$Param, freq_df2$WY), ] | |
| freq_df2$change_flag <- ave( | |
| freq_df2$median_diff_days, | |
| interaction(freq_df2$station, freq_df2$Param), | |
| FUN = function(x) c(NA, abs(diff(x)) > 5) | |
| ) | |
| freq_df2$change_flag <- factor( | |
| ifelse(is.na(freq_df2$change_flag), "No data", | |
| ifelse(freq_df2$change_flag == 1, "Change", "Stable")), | |
| levels = c("No data", "Stable", "Change") | |
| ) | |
| freq_df2$cadence_group <- cut( | |
| freq_df2$median_diff_days, | |
| breaks = c(-Inf, 7, 14, 30, 90, Inf), | |
| labels = c("<7", "7–14", "14–30", "30–90", ">90"), | |
| include.lowest = TRUE | |
| ) | |
| samp_interval <- ggplot(freq_df2, aes(x = WY, y = station, fill = cadence_group)) + | |
| geom_tile(color = "grey80") + | |
| facet_grid(network~ Param,scales = "free_y",space = "free_y") + | |
| scale_fill_viridis_d(option = "C", direction = -1, name = "Sampling Frequency") + | |
| scale_x_discrete( | |
| breaks = seq(1970, max(freq_df2$WY), by = 10) # every 5 years | |
| ) + | |
| labs( | |
| title = "Sampling Interval by Station and Parameter", | |
| x = "Water Year", | |
| y = "Station" | |
| ) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment