library(targets)
meta_pattern <- tar_meta(son_files)
meta_branches <- tar_meta(all_of(meta_pattern$children[[1]]))Total number of branches:
nrow(meta_branches)
#> [1] 2525In the last pipeline run, almost all branches them were skipped (so presumably the mtime was correct at the time)
(skipped <- readLines("log_old.txt") |>
grepl(pattern = "skipped branch son_files_") |>
sum())
#> [1] 2407and these were completed (so the mtime should be correct).
(completed <- readLines("log_old.txt") |>
grepl(pattern = "completed branch son_files_") |>
sum())
#> [1] 118So all branches were either valid at the time (and thus untouched) or completed (and thus valid now)
(skipped + completed) == nrow(meta_branches)
#> [1] TRUEHowever…
mtime_meta <- meta_branches$time
mtime_fs <- file.info(unlist(meta_branches$path))$mtimeOnly this many have the correct mtime as reported by the OS:
sum(mtime_meta == mtime_fs)
#> [1] 71A lot of that might come from imprecise timestamps (#1062), but even rounding down to the second, a lot of them are still wrong.
sum(round.POSIXt(mtime_fs, "secs") != round.POSIXt(mtime_meta, "secs"))
#> [1] 118The time difference is of the order of many hours!
mtime_wrong <- round.POSIXt(mtime_fs, "secs") != round.POSIXt(mtime_meta, "secs")
mtime_dif <- mtime_fs - mtime_meta
as.numeric(mtime_dif[mtime_wrong])/60/60 # in hours
#> [1] 0.8766218 0.8764618 0.8767785 0.8767552 0.8760240 0.8652707
#> [7] 0.8613741 0.8580108 0.8634774 0.8509197 0.8386586 0.8660096
#> [13] 0.8618319 0.8566263 0.8526330 0.8566808 0.8504008 0.8424230
#> [19] 0.8406642 0.8475164 0.8462175 0.8470552 0.8462764 0.8400230
#> [25] 0.8344331 0.8352508 0.8389686 0.8284764 0.8314520 0.8356231
#> [31] 0.8365664 0.8389264 0.8387764 0.8391230 0.8413397 0.8432875
#> [37] 0.8424341 0.8426886 0.8437064 0.8437297 0.8406775 0.8413519
#> [43] 0.8420853 0.8458286 0.8452608 0.8425141 0.8420686 0.8435275
#> [49] 0.8432397 0.8430841 0.8421041 0.8425453 0.8416064 0.8434219
#> [55] 0.8450886 0.8428941 0.8419553 0.8416753 0.8427153 0.8430153
#> [61] 0.8449797 0.8424986 0.8423841 0.8431497 0.8438764 0.8432797
#> [67] 0.8436130 0.8453086 0.8451864 0.8462797 0.8438764 27.6917514
#> [73] 27.6869926 27.6777637 27.6819226 27.6721937 27.6830737 27.6595793
#> [79] 27.6598026 27.6839748 27.2808404 27.1963351 27.1773229 27.1156931
#> [85] 26.9690779 26.9058070 26.8500683 26.8720260 26.8738238 26.8668882
#> [91] 26.8358261 26.8368783 26.8018940 25.7692512 25.7452468 25.7411891
#> [97] 25.7378546 25.7086814 25.6999959 25.6861970 25.6713993 25.6564516
#> [103] 25.5574285 25.5482619 23.4522801 23.4468690 23.4171902 23.4622778
#> [109] 23.4422956 23.0511346 23.0648101 23.0668134 23.0658267 23.0727512
#> [115] 23.0728178 23.0730556 23.0687045 23.0589312All the problematic files stored an earlier timestamp than the one reported by the OS. Some don’t even have the correct size
size_meta <- meta_branches$bytes
size_fs <- file.size(unlist(meta_branches$path))
sum(size_fs != size_meta)
#> [1] 32Some files are recorded as larger, some as smaller.
sum(size_fs < size_meta)
#> [1] 26
sum(size_fs > size_meta)
#> [1] 6All the files with the correct timestamp have the correct size, but only some of the ones with the wrong timestamp have the wrong size.
size_wrong <- size_fs != size_meta
table(mtime_wrong, size_wrong)
#> size_wrong
#> mtime_wrong FALSE TRUE
#> FALSE 2407 0
#> TRUE 86 32A bunch of branches are instantly outdated
outdated <- targets::tar_outdated(son_files, shortcut = TRUE, branches = TRUE)
is_outdated <- meta_branches$name %in% outdated
outdated <- meta_branches$name[is_outdated]
sum(is_outdated)
#> [1] 118Which are the ones with the incorrect mtime
table(is_outdated, mtime_wrong)
#> mtime_wrong
#> is_outdated FALSE TRUE
#> FALSE 2407 0
#> TRUE 0 118All the outdated branches are among the skipped branches!
lines <- readLines("log_old.txt")
skipped_names <- lines[grepl(pattern = "skipped branch son_files_", x = lines)] |>
stringr::str_extract("son_files_\\S+")
all(outdated %in% skipped_names)
#> [1] TRUENaturally, the hash is wrong:
hash_fs <- vapply(unlist(meta_branches$path)[is_outdated],
\(x) digest::digest(file = x, algo = "xxhash64"),
FUN.VALUE = character(1)) |>
unname()
sum(meta_branches[is_outdated, ]$data == hash_fs)
#> [1] 0It is very suspicious that the number of completed branches is the same as the number of currently outdated branches, which would be the number of compelted branches the next time I run the pipeline.
was_outdated <- !(meta_branches$name %in% skipped_names)
table(was_outdated, is_outdated)
#> is_outdated
#> was_outdated FALSE TRUE
#> FALSE 2289 118
#> TRUE 118 0The conclusion is that after the run before this one, 118 were outdated. This run re-run all those 118 branches, which are now up-to-date, but “created” another 118 outdated branches, that will be run the next time I run the pipeline. If this is not a coincidence, then the next time I run the pipeline, it should run the 118 outdated branches and create 118 outdated branches. Let’s try it.
Created on 2024-05-05 with reprex v2.0.2