SwampThingPaul/samp_inv.R

## samp_inv.R
## Function
# Detect sampling frequency for one vector of dates (Date or character)
detect_sampling_freq_base <- function(dates, pct_threshold = 0.6, day_tol = 2) {
  dates <- sort(as.Date(dates))
  if (length(dates) < 2) return(list(label = "insufficient", median_diff = NA, mode_diff = NA, pct = NA))

  diffs_days <- as.numeric(diff(dates))    # consecutive differences in days

  # month differences (integer months between consecutive dates)
  month_index <- function(d) as.integer(format(d, "%Y"))*12 + as.integer(format(d, "%m"))
  diffs_months <- diff(month_index(dates))

  median_diff <- median(diffs_days, na.rm = TRUE)
  # mode of diffs (days)
  mode_diff <- as.numeric(names(sort(table(diffs_days), decreasing = TRUE)[1]))

  # candidate checks
  is_weekly <- abs(diffs_days - 7) <= day_tol
  is_fortnight <- abs(diffs_days - 14) <= day_tol
  is_month_days <- abs(diffs_days - 30) <= 3 | diffs_months == 1
  is_quarter_months <- diffs_months == 3 | abs(diffs_days - 91) <= 7
  is_annual_months <- diffs_months == 12 | abs(diffs_days - 365) <= 15

  # compute proportions
  p_weekly <- mean(is_weekly, na.rm = TRUE)
  p_fort <- mean(is_fortnight, na.rm = TRUE)
  p_month <- mean(is_month_days, na.rm = TRUE)
  p_quarter <- mean(is_quarter_months, na.rm = TRUE)
  p_annual <- mean(is_annual_months, na.rm = TRUE)

  props <- c(weekly = p_weekly, fortnight = p_fort, monthly = p_month, quarterly = p_quarter, annual = p_annual)
  best <- names(which.max(props))
  best_pct <- max(props)

  label <- if (best_pct >= pct_threshold) best else "mixed/irregular"

  list(
    label = label,
    median_diff_days = median_diff,
    mode_diff_days = mode_diff,
    best_pct = best_pct,
    props = props,
    n_intervals = length(diffs_days)
  )
}
# -------------------------------------------------------------------------
library(ggplot2)
library(viridisLite)

# Example data.frame
some_data <- data.frame(
  station = ...,
  network = ...,
  Param = ...,
  Samp.Date = ...,
  WY = ...,
  value = ...
)

## data inventory
inv_df <- aggregate(value ~ station + network + Param + WY,some_data,FUN = length)
names(inv_df)[5] <- "n_samples"

data_inv <- ggplot(inv_df, aes(x = WY, y = station, fill = n_samples)) +
  geom_tile(color = "grey90") +
  scale_fill_viridis_c(option = "C", trans = "sqrt") +
  labs(
    title = paste("Data Inventory (grab sample only)"),
    x = "Water Year",
    y = "Site",
    fill = "# Samples"
  )+
  facet_grid(network~Param,scales = "free_y",space = "free_y")

## sampling internval
groups <- split(df, paste(some_data$station, some_data$Param, sep = "||"))

results <- lapply(names(groups), function(k) {
  sub <- groups[[k]]
  parts <- strsplit(k, "\\|\\|")[[1]]
  res <- detect_sampling_freq_base(sub$Samp.Date)
  data.frame(
    station = parts[1],
    parameter = parts[2],
    freq_label = res$label,
    median_diff_days = res$median_diff_days,
    mode_diff_days = res$mode_diff_days,
    best_pct = res$best_pct,
    n_intervals = res$n_intervals,
    stringsAsFactors = FALSE
  )
})
freq_df <- do.call(rbind, results)
freq_df <- merge(freq_df,sites_locs,"station")

freq_df$freq_label <- factor(freq_df$freq_label,
                             levels = c("mixed/irregular","quarterly",
                                        "monthly","fortnight", "weekly"))

bar_freq_netparam <- ggplot(freq_df, aes(x = freq_label, fill = freq_label)) +
  geom_bar() +labs(
    title = "Sampling Frequency by Parameter",
    x = "Sampling Frequency",
    y = "Number of Stations",
    fill = "Inferred Frequency"
  ) + ylim(0,50) +
  facet_grid(network~parameter,scales = "free_y",space = "free_y")+
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

## Sampling Cadence change
# Sort data by station and date
some_data <- some_data[order(some_data$station, some_data$Samp.Date), ]

# Split by station and year
spl <- split(some_data, list(some_data$station, some_data$Param, some_data$WY), drop = TRUE, sep = "_|_")

# Compute median sampling interval for each group
median_intervals <- sapply(spl, function(x) {
  if (nrow(x) > 1) median(diff(x$Samp.Date)) else NA
})

# Split names into separate columns
split_names <-  do.call(rbind, strsplit(names(median_intervals), "_\\|_"))

freq_df2 <- data.frame(
  station = split_names[, 1],
  Param   = split_names[, 2],
  WY      = split_names[, 3],
  median_diff_days = median_intervals
)
freq_df2 <- freq_df2[!is.na(freq_df2$median_diff_days), ]
freq_df2 <- freq_df2[order(freq_df2$station, freq_df2$Param, freq_df2$WY), ]

freq_df2$change_flag <- ave(
  freq_df2$median_diff_days,
  interaction(freq_df2$station, freq_df2$Param),
  FUN = function(x) c(NA, abs(diff(x)) > 5)
)
freq_df2$change_flag <- factor(
  ifelse(is.na(freq_df2$change_flag), "No data",
         ifelse(freq_df2$change_flag == 1, "Change", "Stable")),
  levels = c("No data", "Stable", "Change")
)

freq_df2$cadence_group <- cut(
  freq_df2$median_diff_days,
  breaks = c(-Inf, 7, 14, 30, 90, Inf),
  labels = c("<7", "7–14", "14–30", "30–90", ">90"),
  include.lowest = TRUE
)

samp_interval <- ggplot(freq_df2, aes(x = WY, y = station, fill = cadence_group)) +
  geom_tile(color = "grey80") +
  facet_grid(network~ Param,scales = "free_y",space = "free_y") +
  scale_fill_viridis_d(option = "C", direction = -1, name = "Sampling Frequency") +
  scale_x_discrete(
    breaks = seq(1970, max(freq_df2$WY), by = 10)  # every 5 years
  ) +
  labs(
    title = "Sampling Interval by Station and Parameter",
    x = "Water Year",
    y = "Station"
  )
	## Function
	# Detect sampling frequency for one vector of dates (Date or character)
	detect_sampling_freq_base <- function(dates, pct_threshold = 0.6, day_tol = 2) {
	dates <- sort(as.Date(dates))
	if (length(dates) < 2) return(list(label = "insufficient", median_diff = NA, mode_diff = NA, pct = NA))

	diffs_days <- as.numeric(diff(dates)) # consecutive differences in days

	# month differences (integer months between consecutive dates)
	month_index <- function(d) as.integer(format(d, "%Y"))*12 + as.integer(format(d, "%m"))
	diffs_months <- diff(month_index(dates))

	median_diff <- median(diffs_days, na.rm = TRUE)
	# mode of diffs (days)
	mode_diff <- as.numeric(names(sort(table(diffs_days), decreasing = TRUE)[1]))

	# candidate checks
	is_weekly <- abs(diffs_days - 7) <= day_tol
	is_fortnight <- abs(diffs_days - 14) <= day_tol
	is_month_days <- abs(diffs_days - 30) <= 3 \| diffs_months == 1
	is_quarter_months <- diffs_months == 3 \| abs(diffs_days - 91) <= 7
	is_annual_months <- diffs_months == 12 \| abs(diffs_days - 365) <= 15

	# compute proportions
	p_weekly <- mean(is_weekly, na.rm = TRUE)
	p_fort <- mean(is_fortnight, na.rm = TRUE)
	p_month <- mean(is_month_days, na.rm = TRUE)
	p_quarter <- mean(is_quarter_months, na.rm = TRUE)
	p_annual <- mean(is_annual_months, na.rm = TRUE)

	props <- c(weekly = p_weekly, fortnight = p_fort, monthly = p_month, quarterly = p_quarter, annual = p_annual)
	best <- names(which.max(props))
	best_pct <- max(props)

	label <- if (best_pct >= pct_threshold) best else "mixed/irregular"

	list(
	label = label,
	median_diff_days = median_diff,
	mode_diff_days = mode_diff,
	best_pct = best_pct,
	props = props,
	n_intervals = length(diffs_days)
	)
	}
	# -------------------------------------------------------------------------
	library(ggplot2)
	library(viridisLite)

	# Example data.frame
	some_data <- data.frame(
	station = ...,
	network = ...,
	Param = ...,
	Samp.Date = ...,
	WY = ...,
	value = ...
	)

	## data inventory
	inv_df <- aggregate(value ~ station + network + Param + WY,some_data,FUN = length)
	names(inv_df)[5] <- "n_samples"

	data_inv <- ggplot(inv_df, aes(x = WY, y = station, fill = n_samples)) +
	geom_tile(color = "grey90") +
	scale_fill_viridis_c(option = "C", trans = "sqrt") +
	labs(
	title = paste("Data Inventory (grab sample only)"),
	x = "Water Year",
	y = "Site",
	fill = "# Samples"
	)+
	facet_grid(network~Param,scales = "free_y",space = "free_y")

	## sampling internval
	groups <- split(df, paste(some_data$station, some_data$Param, sep = "\|\|"))

	results <- lapply(names(groups), function(k) {
	sub <- groups[[k]]
	parts <- strsplit(k, "\\\|\\\|")[[1]]
	res <- detect_sampling_freq_base(sub$Samp.Date)
	data.frame(
	station = parts[1],
	parameter = parts[2],
	freq_label = res$label,
	median_diff_days = res$median_diff_days,
	mode_diff_days = res$mode_diff_days,
	best_pct = res$best_pct,
	n_intervals = res$n_intervals,
	stringsAsFactors = FALSE
	)
	})
	freq_df <- do.call(rbind, results)
	freq_df <- merge(freq_df,sites_locs,"station")

	freq_df$freq_label <- factor(freq_df$freq_label,
	levels = c("mixed/irregular","quarterly",
	"monthly","fortnight", "weekly"))

	bar_freq_netparam <- ggplot(freq_df, aes(x = freq_label, fill = freq_label)) +
	geom_bar() +labs(
	title = "Sampling Frequency by Parameter",
	x = "Sampling Frequency",
	y = "Number of Stations",
	fill = "Inferred Frequency"
	) + ylim(0,50) +
	facet_grid(network~parameter,scales = "free_y",space = "free_y")+
	theme(
	axis.text.x = element_text(angle = 45, hjust = 1)
	)

	## Sampling Cadence change
	# Sort data by station and date
	some_data <- some_data[order(some_data$station, some_data$Samp.Date), ]

	# Split by station and year
	spl <- split(some_data, list(some_data$station, some_data$Param, some_data$WY), drop = TRUE, sep = "_\|_")

	# Compute median sampling interval for each group
	median_intervals <- sapply(spl, function(x) {
	if (nrow(x) > 1) median(diff(x$Samp.Date)) else NA
	})

	# Split names into separate columns
	split_names <- do.call(rbind, strsplit(names(median_intervals), "_\\\|_"))

	freq_df2 <- data.frame(
	station = split_names[, 1],
	Param = split_names[, 2],
	WY = split_names[, 3],
	median_diff_days = median_intervals
	)
	freq_df2 <- freq_df2[!is.na(freq_df2$median_diff_days), ]
	freq_df2 <- freq_df2[order(freq_df2$station, freq_df2$Param, freq_df2$WY), ]

	freq_df2$change_flag <- ave(
	freq_df2$median_diff_days,
	interaction(freq_df2$station, freq_df2$Param),
	FUN = function(x) c(NA, abs(diff(x)) > 5)
	)
	freq_df2$change_flag <- factor(
	ifelse(is.na(freq_df2$change_flag), "No data",
	ifelse(freq_df2$change_flag == 1, "Change", "Stable")),
	levels = c("No data", "Stable", "Change")
	)

	freq_df2$cadence_group <- cut(
	freq_df2$median_diff_days,
	breaks = c(-Inf, 7, 14, 30, 90, Inf),
	labels = c("<7", "7–14", "14–30", "30–90", ">90"),
	include.lowest = TRUE
	)

	samp_interval <- ggplot(freq_df2, aes(x = WY, y = station, fill = cadence_group)) +
	geom_tile(color = "grey80") +
	facet_grid(network~ Param,scales = "free_y",space = "free_y") +
	scale_fill_viridis_d(option = "C", direction = -1, name = "Sampling Frequency") +
	scale_x_discrete(
	breaks = seq(1970, max(freq_df2$WY), by = 10) # every 5 years
	) +
	labs(
	title = "Sampling Interval by Station and Parameter",
	x = "Water Year",
	y = "Station"
	)
No results found