Skip to content

Instantly share code, notes, and snippets.

@szechno
Created February 27, 2026 20:47
Show Gist options
  • Select an option

  • Save szechno/66e65dd8887ff1f9a0ff2936b4c74e68 to your computer and use it in GitHub Desktop.

Select an option

Save szechno/66e65dd8887ff1f9a0ff2936b4c74e68 to your computer and use it in GitHub Desktop.
# check if geometry col is 'geom' and rename
rename_geom_col_as_geometry <- function(x){
if(attr(x, "sf_column") == "geom"){
st_geometry(x) <- "geometry"
}
return(x)
}
filter_dictionaries <- function(filtering_postcode,
x = dic,
y = pupils,
z = os_addresses_sf,
st = source_streets_sf,
cp = codepoint_sf){
x <- x |> filter(postcode == filtering_postcode)
y <- y |> filter(postcode == filtering_postcode)
z <- z |> filter(postcode == filtering_postcode) |>
select(ADDRESS, uprn)
st <- st |> filter(POSTCODE == filtering_postcode)
cp <- cp |> filter(postcode == filtering_postcode)
return(list(filtered_dictionary = x,
filtered_pupils = y,
filtered_os_addresses = z,
filtered_streets = st,
filtered_codepoint = cp))
}
match_addressline_postcode <- function(pupil_source = pupils,
postcodes = pupil_wsx_postcode,
os = os_addresses_sf,
streets = source_streets_sf,
postcode_index = 999,
dictionary = dic,
cpp = codepoint_sf){
postcode <- postcodes[postcode_index]
print(paste0("Now searching through postcode: ", postcode,
". Index: ", postcode_index))
tmp_dic <- filter_dictionaries(filtering_postcode = postcode,
x = dictionary,
y = pupil_source,
z = os,
st = streets,
cp = cpp)
if(anyNA(tmp_dic$filtered_pupils$FOUND)){
tmp_dic$filtered_pupils[is.na(tmp_dic$filtered_pupils$FOUND), ] <-
match_addressline1_2_3(
tmp_dic$filtered_pupils[is.na(tmp_dic$filtered_pupils$FOUND), ],
tmp_dic$filtered_os_addresses
)
}
if(anyNA(tmp_dic$filtered_pupils$FOUND)){
tmp_dic$filtered_pupils[is.na(tmp_dic$filtered_pupils$FOUND), ] <-
match_addressline1_2(
tmp_dic$filtered_pupils[is.na(tmp_dic$filtered_pupils$FOUND), ],
tmp_dic$filtered_os_addresses
)
}
if(anyNA(tmp_dic$filtered_pupils$FOUND)){
tmp_dic$filtered_pupils[is.na(tmp_dic$filtered_pupils$FOUND), ] <-
match_addressline2(
tmp_dic$filtered_pupils[is.na(tmp_dic$filtered_pupils$FOUND), ],
tmp_dic$filtered_os_addresses
)
}
if(anyNA(tmp_dic$filtered_pupils$FOUND)){
tmp_dic$filtered_pupils[is.na(tmp_dic$filtered_pupils$FOUND), ] <-
match_addressline1(
tmp_dic$filtered_pupils[is.na(tmp_dic$filtered_pupils$FOUND), ],
tmp_dic$filtered_os_addresses
)
}
if(anyNA(tmp_dic$filtered_pupils$FOUND)){
tmp_dic$filtered_pupils[is.na(tmp_dic$filtered_pupils$FOUND), ] <-
match_addressline3(
tmp_dic$filtered_pupils[is.na(tmp_dic$filtered_pupils$FOUND), ],
tmp_dic$filtered_os_addresses
)
}
if(anyNA(tmp_dic$filtered_pupils$FOUND)){
tmp_dic$filtered_pupils[is.na(tmp_dic$filtered_pupils$FOUND), ] <-
match_addressline1_partial(
tmp_dic$filtered_pupils[is.na(tmp_dic$filtered_pupils$FOUND), ],
tmp_dic$filtered_os_addresses
)
}
if(anyNA(tmp_dic$filtered_pupils$FOUND)){
tmp_dic$filtered_pupils[is.na(tmp_dic$filtered_pupils$FOUND), ] <-
match_addressline1_partial2(
tmp_dic$filtered_pupils[is.na(tmp_dic$filtered_pupils$FOUND), ],
tmp_dic$filtered_os_addresses
)
}
if(anyNA(tmp_dic$filtered_pupils$FOUND)){
tmp_dic$filtered_pupils[is.na(tmp_dic$filtered_pupils$FOUND), ] <-
match_addressline1_2a(
tmp_dic$filtered_pupils[is.na(tmp_dic$filtered_pupils$FOUND), ],
tmp_dic$filtered_os_addresses
)
}
if(anyNA(tmp_dic$filtered_pupils$FOUND)){
tmp_dic$filtered_pupils[is.na(tmp_dic$filtered_pupils$FOUND), ] <-
match_streets(
tmp_dic$filtered_pupils[is.na(tmp_dic$filtered_pupils$FOUND), ],
tmp_dic$filtered_streets
)
}
if(anyNA(tmp_dic$filtered_pupils$FOUND)){
tmp_dic$filtered_pupils[is.na(tmp_dic$filtered_pupils$FOUND), ] <-
match_postcode_centroid(
tmp_dic$filtered_pupils[is.na(tmp_dic$filtered_pupils$FOUND), ],
tmp_dic$filtered_codepoint
)
}
return(tmp_dic$filtered_pupils)
}
match_addressline1_2_3 <- function(filtered_pupils,
filtered_os_addresses){
for(i in 1:nrow(filtered_pupils)){
tmp <- filtered_os_addresses |> filter(
str_detect(paste0(" ", ADDRESS),
paste0(" ", filtered_pupils$addressline1[i], " ")) &
str_detect(ADDRESS,
paste0(filtered_pupils$addressline2[i], " ")) &
str_detect(ADDRESS,
paste0(filtered_pupils$addressline3[i], " "))
)
if(nrow(tmp) == 1){
filtered_pupils$FOUND[i] <- tmp$ADDRESS
filtered_pupils$UPRN[i] <- tmp$uprn
filtered_pupils$geometry[i] <- tmp$geometry
}
}
return(filtered_pupils)
}
match_addressline1_2 <- function(filtered_pupils,
filtered_os_addresses){
for(i in 1:nrow(filtered_pupils)){
tmp <- filtered_os_addresses |> filter(
str_detect(paste0(" ", ADDRESS),
paste0(" ", filtered_pupils$addressline1[i], " ")) &
str_detect(ADDRESS,
paste0(filtered_pupils$addressline2[i], " "))
)
if(nrow(tmp) == 1){
filtered_pupils$FOUND[i] <- tmp$ADDRESS
filtered_pupils$UPRN[i] <- tmp$uprn
filtered_pupils$geometry[i] <- tmp$geometry
}
}
return(filtered_pupils)
}
match_addressline2_3 <- function(filtered_pupils,
filtered_os_addresses){
for(i in 1:nrow(filtered_pupils)){
tmp <- filtered_os_addresses |> filter(
str_detect(paste0(" ", ADDRESS),
paste0(" ", filtered_pupils$addressline2[i], " ")) &
str_detect(ADDRESS,
paste0(filtered_pupils$addressline3[i], " "))
)
if(nrow(tmp) == 1){
filtered_pupils$FOUND[i] <- tmp$ADDRESS
filtered_pupils$UPRN[i] <- tmp$uprn
filtered_pupils$geometry[i] <- tmp$geometry
}
}
return(filtered_pupils)
}
match_addressline1_2a <- function(filtered_pupils,
filtered_os_addresses){
for(i in 1:nrow(filtered_pupils)){
tmp <- filtered_os_addresses |> filter(
str_detect(paste0(" ", ADDRESS),
paste0(" ", filtered_pupils$addressline1[i], " ", filtered_pupils$addressline2[i]))
)
if(nrow(tmp) == 1){
filtered_pupils$FOUND[i] <- tmp$ADDRESS
filtered_pupils$UPRN[i] <- tmp$uprn
filtered_pupils$geometry[i] <- tmp$geometry
}
}
return(filtered_pupils)
}
match_addressline1_2_3a <- function(filtered_pupils,
filtered_os_addresses){
for(i in 1:nrow(filtered_pupils)){
tmp <- filtered_os_addresses |> filter(
str_detect(paste0(" ", ADDRESS),
paste0(" ", filtered_pupils$addressline1[i], " ", filtered_pupils$addressline2[i], " ", filtered_pupils$addressline3[i]))
)
if(nrow(tmp) == 1){
filtered_pupils$FOUND[i] <- tmp$ADDRESS
filtered_pupils$UPRN[i] <- tmp$uprn
filtered_pupils$geometry[i] <- tmp$geometry
}
}
return(filtered_pupils)
}
match_addressline1 <- function(filtered_pupils,
filtered_os_addresses){
for(i in 1:nrow(filtered_pupils)){
tmp <- filtered_os_addresses |> filter(
str_detect(paste0(" ", ADDRESS),
paste0(" ", filtered_pupils$addressline1[i], " "))
)
if(nrow(tmp) == 1){
filtered_pupils$FOUND[i] <- tmp$ADDRESS
filtered_pupils$UPRN[i] <- tmp$uprn
filtered_pupils$geometry[i] <- tmp$geometry
}
}
return(filtered_pupils)
}
match_addressline2 <- function(filtered_pupils,
filtered_os_addresses){
for(i in 1:nrow(filtered_pupils)){
tmp <- filtered_os_addresses |> filter(
str_detect(ADDRESS,
paste0(filtered_pupils$addressline2[i], " "))
)
if(nrow(tmp) == 1){
filtered_pupils$FOUND[i] <- tmp$ADDRESS
filtered_pupils$UPRN[i] <- tmp$uprn
filtered_pupils$geometry[i] <- tmp$geometry
}
}
return(filtered_pupils)
}
match_addressline3 <- function(filtered_pupils,
filtered_os_addresses){
for(i in 1:nrow(filtered_pupils)){
tmp <- filtered_os_addresses |> filter(
str_detect(ADDRESS,
paste0(filtered_pupils$addressline3[i], " "))
)
if(nrow(tmp) == 1){
filtered_pupils$FOUND[i] <- tmp$ADDRESS
filtered_pupils$UPRN[i] <- tmp$uprn
filtered_pupils$geometry[i] <- tmp$geometry
}
}
return(filtered_pupils)
}
match_addressline1_partial <- function(filtered_pupils,
filtered_os_addresses){
# assume 130A HOOLINGBURY AVENUE
# should be 130A HOLLINGBURY PARK AVENUE
# match on 130A
for(i in 1:nrow(filtered_pupils)){
if(!is.na(filtered_pupils$addressline1[i])){
tmp <- filtered_os_addresses |> filter(
str_detect(paste0(" ", ADDRESS),
paste0(" ", str_split_1(filtered_pupils$addressline1[i], " ")[[1]], " "))
)
if(nrow(tmp) == 1){
filtered_pupils$FOUND[i] <- tmp$ADDRESS
filtered_pupils$UPRN[i] <- tmp$uprn
filtered_pupils$geometry[i] <- tmp$geometry
}
}
}
return(filtered_pupils)
}
match_addressline1_partial2 <- function(filtered_pupils,
filtered_os_addresses){
# assume 130A HOOLINGBURY AVENUE
# should be 130A HOLLINGBURY PARK AVENUE
# match on 130A
for(i in 1:nrow(filtered_pupils)){
if(
!is.na(filtered_pupils$addressline1[i]) &&
length(str_split_1(filtered_pupils$addressline1[i], " ")) > 1
){
tmp <- filtered_os_addresses |> filter(
str_detect(paste0(" ", ADDRESS),
paste0(" ", str_split_1(filtered_pupils$addressline1[i], " ")[[1]], " ")) &
str_detect(paste0(" ", ADDRESS),
paste0(" ", str_split_1(filtered_pupils$addressline1[i], " ")[[2]], " "))
)
if(nrow(tmp) == 1){
filtered_pupils$FOUND[i] <- tmp$ADDRESS
filtered_pupils$UPRN[i] <- tmp$uprn
filtered_pupils$geometry[i] <- tmp$geometry
}
}
}
return(filtered_pupils)
}
match_streets <- function(filtered_pupils,
filtered_streets){
for(i in 1:nrow(filtered_pupils)){
tmp <- filtered_streets[
str_detect(filtered_pupils$FIND[[i]], filtered_streets$street_description),
] |>
slice(1)
if(nrow(tmp) == 1){
filtered_pupils$FOUND[i] <- tmp$POSTCODE
filtered_pupils$UPRN[i] <- paste0("usrn: ", tmp$usrn)
filtered_pupils$geometry[i] <- tmp$geometry
}
}
return(filtered_pupils)
}
match_postcode_centroid <- function(filtered_pupils,
filtered_codepoint_point){
for(i in 1:nrow(filtered_pupils)){
tmp <- filtered_codepoint_point[
filtered_codepoint_point$postcode == filtered_pupils$postcode[[i]],
]
if(nrow(tmp) == 1){
filtered_pupils$FOUND[i] <- tmp$postcode
filtered_pupils$UPRN[i] <- paste0("postcode centroid")
filtered_pupils$geometry[i] <- tmp$geometry
}
}
return(filtered_pupils)
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
match_bad_postcodes <- function(tmp = pupils[pupils$postcode %in% bad_postcodes, ],
bad_postcodes = pupil_bad_postcodes,
os = os_addresses_sf){
# 1
if(anyNA(tmp$FOUND)){
tmp <- match_addressline1_2_3a(tmp,
os)
}
# 2
if(anyNA(tmp$FOUND)){
tmp[is.na(tmp$FOUND), ] <- match_addressline2_3(
tmp[is.na(tmp$FOUND), ],
os)
}
# 3
if(anyNA(tmp$FOUND)){
tmp[is.na(tmp$FOUND) & is.na(tmp$addressline3), ] <- match_addressline1_2a(
tmp[is.na(tmp$FOUND) & is.na(tmp$addressline3), ],
os)
}
# 4
if(anyNA(tmp$FOUND)){
tmp[is.na(tmp$FOUND), ] <- match_addressline1_2a(
tmp[is.na(tmp$FOUND), ],
os)
}
# 5
if(anyNA(tmp$FOUND)){
tmp[is.na(tmp$FOUND), ] <- match_bad_addresses(
tmp[is.na(tmp$FOUND), ],
os)
}
return(tmp)
}
match_bad_addresses_coords <- function(filtered_pupils){
# deal with remaining...
# hardcoding google coords using FIND field
# if geometry NA after first pass, then add to list and rerun
# > str_detect("CARAVAN NUMBER 646 SHRIPNEY ROAD BOGNOR REGIS WEST SUSSEX", regex("(?=.*CARAVAN)(?=.*SHRIPNEY)"))
# [1] TRUE
# use regex("(?=.*WORD1)(?=.*WORD2)") ----
for(i in 1:nrow(filtered_pupils)){
if(
str_detect(filtered_pupils$FIND[i], regex("(?=.*ARUN)(?=.*SOUTHGATE)(?=.*CRAWLEY)"))){coords <- "51.11906847894861, -0.1707827739334607"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*BOX)(?=.*836)(?=.*CHICHESTER)"))){coords <- "50.830872206821525, -0.781349873296174"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*CARAVAN)(?=.*SHRIPNEY)(?=.*BOGNOR)"))){coords <- "50.806370862342156, -0.666664238229508"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*CELADINE)(?=.*WORTHING)"))){coords <- "50.834463518349715, -0.4260124309691420"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*COMPTON)(?=.*FELPHAM)(?=.*BOGNOR)"))){coords <- "50.79861621917546, -0.629707702134062"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*COPTHORNE)(?=.*HOTEL)"))){coords <- "51.1324694216759, -0.1243449163326338"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*EAGLE)(?=.*PEARY)(?=.*HORSHAM)"))){coords <- "51.08357236456843, -0.316855902122193"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*EAST)(?=.*LODGE)(?=.*FARLINGTON)"))){coords <- "50.84763052628856, -1.025377446314816"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*FARLINGTON)(?=.*PORTSMOUTH)"))){coords <- "50.84410370459923, -1.031579939127228"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*FERON)(?=.*CRAWLEY)"))){coords <- "51.10766900435257, -0.167166830957726"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*FROXFIELD)(?=.*HAVANT)"))){coords <- "50.87008847015443, -0.966047346313887"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*GEORGES)(?=.*LONGMERE)(?=.*CRAWLEY)"))){coords <- "51.12319868624724, -0.189708830957101"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*HEATHER)(?=.*BROADFIELD)(?=.*CRAWLEY)"))){coords <- "51.10200385033542, -0.2070057581695635"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*HEIDELBERG)(?=.*SARK)"))){coords <- "49.43232853923432, -2.360690090714060"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*HILL)(?=.*ROAD)(?=.*STEYNING)"))){coords <- "50.88664414405455, -0.332168859803523"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*HOLM)(?=.*FARM)(?=.*CASTLE)"))){coords <- "50.88392649702428, -0.931648002130512"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*LIMESTONE)(?=.*ROAD)(?=.*CHICHESTER)"))){coords <- "50.83892687401044, -0.7511219739466515"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*MILL)(?=.*CRESCENT)(?=.*LITTLEHAMPTON)"))){coords <- "50.82750925594101, -0.541380917072307"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*SOUNDWELL)(?=.*BRISTOL)"))){coords <- "51.47501690452455, -2.50726932209139"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*SOUTHWICK)(?=.*BRIGHTON)"))){coords <- "50.83621172415037, -0.2329973807649555"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*SOUTHWICK)(?=.*EAST)(?=.*SUSSEX)"))){coords <- "50.83621172415037, -0.2329973807649555"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*STREET)(?=.*TOWN)(?=.*STATE)"))){coords <- "50.834981913149385, -0.330367395506445"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*TILLER)(?=.*CRESCENT)(?=.*BOGNOR)"))){coords <- "50.77449395715705, -0.735300203990987"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*TRAVELODGE)(?=.*PEGLER)"))){coords <- "51.11771840116212, -0.1898663021207412"
} else if(str_detect(filtered_pupils$FIND[i], regex("(?=.*WOODFIELD)(?=.*BURGESS)(?=.*HILL)"))){coords <- "50.95144970748227, -0.1103184590922752"
} else {
coords <- NA
}
print(paste0("coords : ", coords))
if(!is.na(coords)){
filtered_pupils$FOUND[i] <- "google"
filtered_pupils$UPRN[i] <- "coords"
filtered_pupils$geometry[i] <- transform_latlng_bng(coords)
} else {
filtered_pupils$FOUND[i] <- NA
filtered_pupils$UPRN[i] <- NA
filtered_pupils$geometry[i] <- NA
}
}
return(filtered_pupils)
}
match_bad_addresses <- function(filtered_pupils,
filtered_os_addresses = os_addresses_sf){
## create additions to match_bad_addresses:-
# for(i in 1:nrow(withy[is.na(withy$FOUND), ])){
# cat(paste0(" } else if(str_detect(filtered_pupils$FIND[i],\n",
# " \"", withy[is.na(withy$FOUND), ]$FIND[i], "\")\n",
# " ){\n",
# " search_term <- \"XXXXXXXXXXXX\"\n"))
# }
for(i in 1:nrow(filtered_pupils)){
search_term <- ""
if(str_detect(filtered_pupils$FIND[i],
"WITHY")){
search_term <- str_c(str_extract(filtered_pupils$FIND[i], "[:digit:]{1,3}"),
" NEW WITHY PARK HONEYMANS PLACE LANCING")
} else if(str_detect(filtered_pupils$FIND[i],
"PLOT")
){
search_term <- str_c(
filtered_pupils$addressline1[i] |> str_replace("PLOT", ""),
filtered_pupils$addressline2[i])
} else if(str_detect(filtered_pupils$FIND[i],
"FLAT")
){
search_term <- str_c(
filtered_pupils$addressline1[i] |> str_replace("FLAT", ""),
filtered_pupils$addressline2[i])
}
print(paste0("search term : ", search_term))
tmp <- filtered_os_addresses |>
filter(str_detect(paste0(" ", ADDRESS),
paste0(" ", search_term)))
if(nrow(tmp) == 1){
filtered_pupils$FOUND[i] <- tmp$ADDRESS
filtered_pupils$UPRN[i] <- tmp$uprn
filtered_pupils$geometry[i] <- tmp$geometry
}
}
return(filtered_pupils)
}
transform_latlng_bng <- function(latlng_as_string){
# latlng as output by google maps, ie backwards
latlng = str_split_1(latlng_as_string, ", ")
point <- st_point(x = c(as.numeric(latlng[2]),
as.numeric(latlng[1])))
sfc = st_sfc(point, crs = 4326)
point <- sfc |> st_transform(27700)
return(point)
}
make_pupils_sf <- function(x){
x <- x |> st_as_sf() |> st_set_crs(27700)
}
# copied but not yet used ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drop_na_char_vector <- function(x){
x <- x[!is.na(x)]
if(length(x) == 0) x <- NA else x
return(x)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment