Skip to content

Instantly share code, notes, and snippets.

@szechno
Created February 27, 2026 16:09
Show Gist options
  • Select an option

  • Save szechno/6dcb43bf6f45079744bfbb0f39f931aa to your computer and use it in GitHub Desktop.

Select an option

Save szechno/6dcb43bf6f45079744bfbb0f39f931aa to your computer and use it in GitHub Desktop.
`%nin%` <- Negate(`%in%`)
read_and_merge_osaddressbase <- function(folder_date, # = "2024-08"
filename, # = "6581692"
stub = "D:/data_store/OS.ADDRESSBASE/",
ext = ".gpkg"){
# merge two layers we need so we can intersect by wsx boundary
# where to find the geopackage and its filename
gpkg <- paste0(stub, folder_date, "/", filename, ext)
# add advisory
print(paste("Reading and merging ", gpkg))
# read two layers we need to merge to make sf
# uprn, geom
os_blpu <- st_read(gpkg,
layer = "blpu")
# uprn, etc...
os_delivery_point_address <- st_read(gpkg,
layer = "delivery_point_address")
os_all_fields <- merge(os_blpu,
os_delivery_point_address,
by.x = "uprn",
by.y = "uprn")
# usrn, etc..., geom
os_street <- st_read(gpkg,
layer = "street") |>
select(usrn, geom)
# usrn, street_description
os_street_descriptor <- st_read(gpkg,
layer = "street_descriptor") |>
select(usrn, street_description)
os_streets <- merge(os_street,
os_street_descriptor,
by.x = "usrn",
by.y = "usrn")
return(list(os_all_fields, os_streets))
}
remove_os_fields <- function(x){
# remove fields not required for making dictionaries
# but retain geometry for copying to pupils
x <- x |> select(-c("change_type.x", "logical_status", "blpu_state",
"blpu_state_date", "rpc", "local_custodian_code",
"country", "start_date.x", "end_date.x",
"last_update_date.x", "entry_date.x",
"addressbase_postal", "postcode_locator",
"multi_occ_count", "change_type.y",
"postcode_type", "delivery_point_suffix",
"welsh_dependent_thoroughfare","welsh_thoroughfare",
"welsh_double_dependent_locality",
"welsh_dependent_locality",
"welsh_post_town",
"process_date", "start_date.y", "end_date.y",
"last_update_date.y", "entry_date.y","parent_uprn",
"x_coordinate", "y_coordinate", "latitude", "longitude",
"udprn"))
x$uprn <- as.character(x$uprn)
return(x)
}
add_os_ADDRESS_field <- function(x){
x <- x |> mutate(ADDRESS =
str_c(
str_replace_na(x$organisation_name,
replacement = ""),
" ",
str_replace_na(x$department_name,
replacement = ""),
" ",
str_replace_na(x$po_box_number,
replacement = ""),
" ",
str_replace_na(x$sub_building_name,
replacement = ""),
" ",
str_replace_na(x$building_name,
replacement = ""),
" ",
str_replace_na(x$building_number,
replacement = ""),
" ",
str_replace_na(x$dependent_thoroughfare,
replacement = ""),
" ",
str_replace_na(x$thoroughfare,
replacement = ""),
" ",
str_replace_na(x$double_dependent_locality,
replacement = ""),
" ",
str_replace_na(x$dependent_locality,
replacement = ""),
" ",
str_replace_na(x$post_town,
replacement = ""),
" ",
str_replace_na(x$postcode,
replacement = "")
) |> str_squish()
)
}
add_pupils_ADDRESS_field <- function(x){
x <- x |> mutate(FIND =
str_c(
str_replace_na(x$addressline1,
replacement = ""),
" ",
str_replace_na(x$addressline2,
replacement = ""),
" ",
str_replace_na(x$addressline3,
replacement = ""),
" ",
str_replace_na(x$addressline4,
replacement = ""),
" ",
str_replace_na(x$addressline5,
replacement = "")
) |> str_squish(),
FOUND = NA,
UPRN = NA,
geometry = NA
)
}
read_wsx_boundary <- function(shape = paste0("D:/data_store/Boundaries/county/",
"county.shp"),
buffer = 10000){
wsx <- read_sf(shape)
wsx <- wsx |> st_buffer(dist = buffer) |> select(geometry)
}
create_dictionary <- function(x, var){
# create dictionary
# used by create_dictionaries function
dic <- x |>
select({{ var }}, postcode) |>
drop_na({{ var }}) |>
unique() |>
arrange({{ var }}) |>
rename(term = {{ var }})
}
collapse_dictionaries <- function(x){
# "dic_sub_building_name",
# "dic_building_name",
## "dic_building_number",
# "dic_thoroughfare",
# "dic_dependent_thoroughfare",
# "dic_double_dependent_locality",
# "dic_post_town",
# "dic_county"
x |> list_rbind()
}
create_dictionaries <- function(x){
# create all dictionaries
# > names(source_wsx)
# [1] "organisation_name" "department_name" "sub_building_name" "building_name"
# [5] "building_number" "dependent_thoroughfare" "thoroughfare" "double_dependent_locality"
# * building_name
dic_sub_building_name <- x |> create_dictionary(sub_building_name) |>
arrange() |> unique()
# * building_name
dic_building_name <- x |> create_dictionary(building_name) |>
arrange() |> unique()
# * building_number ... really necessary?
# as.numeric(dictionaries$dic_building_number$term) |> is.na() |> table()
# FALSE
# 297132
# dic_building_number <- x |> create_dictionary(building_number) |>
# arrange() |> unique()
# * county
dic_county <- create_dictionaries_counties()
# * posttown
dic_post_town <- x |> create_dictionary(post_town) |>
arrange() |> unique()
# * locality
dic_double_dependent_locality <- x |>
create_dictionary(double_dependent_locality) |>
arrange() |> unique()
dic_dependent_locality <- x |>
create_dictionary(dependent_locality) |>
arrange() |> unique()
# combine character vectors and remove subsequent duplicates
# dic_locality <- c(dic_dependent_locality, dic_double_dependent_locality) |>
# sort() |> unique()
# dic_locality
# * street
dic_thoroughfare <- x |>
create_dictionary(thoroughfare) |>
arrange() |> unique()
dic_dependent_thoroughfare <- x |>
create_dictionary(dependent_thoroughfare) |>
arrange() |> unique()
# dic_street <- c(dic_thoroughfare, dic_dependent_thoroughfare) |>
# sort() |> unique()
# return list
return(list(#dic_street = dic_street,
dic_sub_building_name = dic_sub_building_name,
dic_building_name = dic_building_name,
# dic_building_number = dic_building_number,
dic_thoroughfare = dic_thoroughfare,
dic_dependent_thoroughfare = dic_dependent_thoroughfare,
dic_double_dependent_locality = dic_double_dependent_locality,
# dic_locality = dic_locality,
dic_post_town = dic_post_town,
dic_county = dic_county))
}
create_dictionaries_counties <- function(
gpkg_path = "D:/data_store/OS Boundary-Line/bdline_gb_2024-03-26.gpkg"){
# special case for counties
county <- st_read(gpkg_path,
layer = "county") |>
st_drop_geometry() |>
select(Name)
county <- county$Name |> str_replace(" County", "")
county <- county[!county == "Greater London Authority"]
# district_borough_unitary
dbs <- st_read(gpkg_path,
layer = "district_borough_unitary") |>
st_drop_geometry() |>
select(Name)
# dbs$Name <- dbs$Name |> str_replace(" District (B)", "")
dbs$Name <- dbs$Name |> str_replace(" District", "")
dbs$Name <- dbs$Name |> str_replace(" London Boro", "")
dbs$Name <- dbs$Name |> str_replace("City of ", "")
dbs <- dbs$Name |> str_replace(" \\(B\\)", "")
dbs <- c(dbs, "Bedfordshire", "County of Bedford", "Beds",
"Berkshire", "County of Berks", "Berks",
"Buckinghamshire", "County of Buckingham", "Bucks",
"Cambridgeshire", "County of Cambridge", "Cambs",
"Cheshire", "County of Chester", "Ches", "Cornwall", "Corn",
"Cumberland", "Cumb", "Derbyshire", "County of Derby", "Derbys",
"Derbs", "Devon", "Devonshire", "Dev", "Dorset", "Dorsetshire",
"Dor", "Durham", "County of Durham", "Co Dur", "Essex",
"Gloucestershire", "County of Gloucester", "Glos", " Gloucs",
"Hampshire", "County of Southampton", "Hants",
"Southamptonshire", "Herefordshire", "County of Hereford", "Here",
"Heref", "Hertfordshire", "County of Hertford", "Herts",
"Huntingdonshire", "County of Huntingdon", "Hunts", "Kent",
"Lancashire", "County of Lancaster", "Lancs",
"Leicestershire", "County of Leicester", "Leics",
"Lincolnshire", "County of Lincoln", "Lincs",
"Middlesex", "Mx", "Middx", "Mddx", "M'sex",
"Norfolk", "Norf", "Northamptonshire", "County of Northampton",
"Northants", "Northumberland", "Northumb", "Northd",
"Nottinghamshire", "County of Nottingham", "Notts",
"Oxfordshire", "County of Oxford", "Oxon",
"Rutland", "Rutlandshire", "Rut", "Shropshire", "County of Salop",
"Shrops", "Salop", "Somerset", "Somersetshire", "Som",
"Staffordshire", "County of Stafford", "Staffs", " Staf",
"Suffolk", "Suff", "Surrey", "Sy", "Sussex", "Sx", " Ssx",
"Warwickshire", "County of Warwick", "Warks", " War", "Warw",
"Westmorland", "Westm", "Wiltshire", "County of Wilts", "Wilts",
"Worcestershire", "County of Worcester", "Worcs", "Worsts",
"Yorkshire", "County of York", "Yorks"
)
dbs <- dbs |> unique() |> str_to_upper() |> trimws() |> sort()
tibble(term = dbs, postcode = "")
}
create_postcodes <- function(x){
# get a list of all relevant postcodes
# to filter addresses on and confirm matches
x$postcode |> unique() |> sort()
}
rename_pupil_fields <- function(x){
# Correct field names for pupils
# from summer2024 and powerBI dashboard:
x <- x |> rename(
"NativeId" = "DfE(7)",
# "pupilonrolltableid" = "pupilonrolltableid",
"pupilonrolltableid" = "PupilID",
# "schoolname" = "School/Academy",
"schoolname" = "School Name",
"enrolstatus" = "Enrolment Status",
# "postcode" = "postcode",
"postcode" = "Postcode",
"saon" = "saon",
"paon" = "paon",
"street" = "street",
"locality" = "locality",
"town" = "town",
"administrativearea" = "administrativearea",
"posttown" = "posttown",
"addressline1" = "addressline1",
"addressline2" = "addressline2",
"addressline3" = "addressline3",
"addressline4" = "addressline4",
"addressline5" = "addressline5")
# make NativeId a character vector
x$NativeId <- as.character(x$NativeId)
return(x)
}
tidy_fields <- function(x){
# run on both pupil address fields
# and on OS fields for consistent
# approach
# see also original `address_hyphen_to_na`
x |>
str_to_upper() |>
str_replace_all("-- SELECT --", "")|>
str_replace_all("- ", "") |>
str_replace_all(regex( "[^[:alnum:][:blank:][-]]"), "") |>
str_squish()
}
tidy_fields_na <- function(x){
# any fields with just an empty string become NA
# compare x to empty string, NA if TRUE
na_if(x, "")
}
tidy_all_fields <- function(x, start, finish){
# mapping function for all fields
x <- x |> mutate(across({{ start }}:{{ finish }}, tidy_fields))
x <- x |> mutate(across({{ start }}:{{ finish }}, tidy_fields_na))
}
# NOTE: maybe redundant due to change of direction... delete if so:
add_fields_pupils <- function(x, y = os_wsx_addresses){
# [1] "organisation_name" "department_name" "sub_building_name" "building_name"
# [5] "building_number" "dependent_thoroughfare" "thoroughfare" "double_dependent_locality"
# [9] "dependent_locality" "post_town" "postcode" "po_box_number"
tmp <- names(y)
tmp <- tmp[!tmp == "postcode"]
tmp <- c(tmp, "easting", "northing")
x <- x |> mutate(!!!setNames(rep(NA, length(tmp)), tmp))
return(x)
}
move_addresses_to_addressline_fields <- function(x){
# take entire dataframe and subset either side
# simply copy values from saon:town to addressline1:5
# NOTE: ignores administrativearea and posttown
# move saon where not NA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
truthy_pupils <- !is.na(x$saon) & is.na(x$addressline1)
x[truthy_pupils, ]$addressline1 <- x[truthy_pupils, ]$saon
x[truthy_pupils, ]$saon <- NA
# move paon where not NA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## to al1 if NA
truthy_pupils <- !is.na(x$paon) & is.na(x$addressline1)
x[truthy_pupils, ]$addressline1 <- x[truthy_pupils, ]$paon
x[truthy_pupils, ]$paon <- NA
## to al2 if al1 is full
truthy_pupils <- !is.na(x$paon) & !is.na(x$addressline1) & is.na(x$addressline2)
x[truthy_pupils, ]$addressline2 <- x[truthy_pupils, ]$paon
x[truthy_pupils, ]$paon <- NA
# move street where not NA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## to al1 if NA
truthy_pupils <- !is.na(x$street) & is.na(x$addressline1)
x[truthy_pupils, ]$addressline1 <- x[truthy_pupils, ]$street
x[truthy_pupils, ]$street <- NA
## to al2 if al1 is full
truthy_pupils <- !is.na(x$street) & !is.na(x$addressline1) & is.na(x$addressline2)
x[truthy_pupils, ]$addressline2 <- x[truthy_pupils, ]$street
x[truthy_pupils, ]$street <- NA
## to al3 if al1 and al2 full
truthy_pupils <- !is.na(x$street) & !is.na(x$addressline1) & !is.na(x$addressline2) & is.na(x$addressline3)
x[truthy_pupils, ]$addressline3 <- x[truthy_pupils, ]$street
x[truthy_pupils, ]$street <- NA
# move locality where not NA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## to al1 if NA
truthy_pupils <- !is.na(x$locality) & is.na(x$addressline1)
x[truthy_pupils, ]$addressline1 <- x[truthy_pupils, ]$locality
x[truthy_pupils, ]$locality <- NA
## to al2 if al1 is full
truthy_pupils <- !is.na(x$locality) & !is.na(x$addressline1) & is.na(x$addressline2)
x[truthy_pupils, ]$addressline2 <- x[truthy_pupils, ]$locality
x[truthy_pupils, ]$locality <- NA
## to al3 if al1 and al2 full
truthy_pupils <- !is.na(x$locality) & !is.na(x$addressline1) & !is.na(x$addressline2) & is.na(x$addressline3)
x[truthy_pupils, ]$addressline3 <- x[truthy_pupils, ]$locality
x[truthy_pupils, ]$locality <- NA
## to al4 if al1 and al2 and al3 full
truthy_pupils <- !is.na(x$locality) & !is.na(x$addressline1) & !is.na(x$addressline2) & !is.na(x$addressline3) & is.na(x$addressline4)
x[truthy_pupils, ]$addressline4 <- x[truthy_pupils, ]$locality
x[truthy_pupils, ]$locality <- NA
# move town where not NA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## to al1 if NA
truthy_pupils <- !is.na(x$town) & is.na(x$addressline1)
x[truthy_pupils, ]$addressline1 <- x[truthy_pupils, ]$town
x[truthy_pupils, ]$town <- NA
## to al2 if al1 is full
truthy_pupils <- !is.na(x$town) & !is.na(x$addressline1) & is.na(x$addressline2)
x[truthy_pupils, ]$addressline2 <- x[truthy_pupils, ]$town
x[truthy_pupils, ]$town <- NA
## to al3 if al1 and al2 full
truthy_pupils <- !is.na(x$town) & !is.na(x$addressline1) & !is.na(x$addressline2) & is.na(x$addressline3)
x[truthy_pupils, ]$addressline3 <- x[truthy_pupils, ]$town
x[truthy_pupils, ]$town <- NA
## to al4 if al1 and al2 and al3 full
truthy_pupils <- !is.na(x$town) & !is.na(x$addressline1) & !is.na(x$addressline2) & !is.na(x$addressline3) & is.na(x$addressline4)
x[truthy_pupils, ]$addressline4 <- x[truthy_pupils, ]$town
x[truthy_pupils, ]$town <- NA
## to al5 if al1 and al2 and al3 and al4 full
truthy_pupils <- !is.na(x$town) & !is.na(x$addressline1) & !is.na(x$addressline2) & !is.na(x$addressline3) & !is.na(x$addressline4) & is.na(x$addressline5)
x[truthy_pupils, ]$addressline5 <- x[truthy_pupils, ]$town
x[truthy_pupils, ]$town <- NA
truthy_pupils <- !is.na(x$saon) | !is.na(x$paon) |
!is.na(x$street) | !is.na(x$locality) |
!is.na(x$town)
print(paste0("Observation where saon, paon, street, locality, ",
"town have not been moved: ",
x[truthy_pupils, ] |> nrow()
)
)
x$administrativearea <- NA
x$posttown <- NA
return(x)
}
drop_columns_where_all_NA <- function(x){
# drop columns where all is NA
x <- x |> select_if(~ !all(is.na(.)))
}
drop_rows_where_not_938 <- function(x){
# drop rows where NativeId not 938****[[]]
x <- x |> filter(str_detect(NativeId, "938"))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment