szechno/functions_gpr_00_creating.R

## functions_gpr_00_creating.R
`%nin%` <- Negate(`%in%`)

read_and_merge_osaddressbase <- function(folder_date,             # = "2024-08"
                                         filename,                # = "6581692"
                                         stub = "D:/data_store/OS.ADDRESSBASE/",
                                         ext = ".gpkg"){
  # merge two layers we need so we can intersect by wsx boundary
  # where to find the geopackage and its filename
  gpkg <- paste0(stub, folder_date, "/", filename, ext)
  # add advisory
  print(paste("Reading and merging ", gpkg))
  # read two layers we need to merge to make sf
  # uprn, geom
  os_blpu <- st_read(gpkg,
                     layer = "blpu")
  # uprn, etc...
  os_delivery_point_address <- st_read(gpkg,
                                       layer = "delivery_point_address")
  os_all_fields <- merge(os_blpu,
                         os_delivery_point_address,
                         by.x = "uprn",
                         by.y = "uprn")
  # usrn, etc..., geom
  os_street <- st_read(gpkg,
                       layer = "street")  |>
    select(usrn, geom)
  # usrn, street_description
  os_street_descriptor <- st_read(gpkg,
                                  layer = "street_descriptor") |>
    select(usrn, street_description)
  os_streets <- merge(os_street,
                         os_street_descriptor,
                         by.x = "usrn",
                         by.y = "usrn")
  return(list(os_all_fields, os_streets))
}

remove_os_fields <- function(x){
  # remove fields not required for making dictionaries
  # but retain geometry for copying to pupils
  x <- x |> select(-c("change_type.x", "logical_status", "blpu_state",
                        "blpu_state_date", "rpc", "local_custodian_code",
                        "country", "start_date.x", "end_date.x",
                        "last_update_date.x", "entry_date.x",
                        "addressbase_postal", "postcode_locator",
                        "multi_occ_count", "change_type.y",
                        "postcode_type", "delivery_point_suffix",
                        "welsh_dependent_thoroughfare","welsh_thoroughfare",
                        "welsh_double_dependent_locality",
                        "welsh_dependent_locality",
                        "welsh_post_town",
                        "process_date", "start_date.y", "end_date.y",
                        "last_update_date.y", "entry_date.y","parent_uprn",
                        "x_coordinate", "y_coordinate", "latitude", "longitude",
                        "udprn"))
  x$uprn <- as.character(x$uprn)
  return(x)
}

add_os_ADDRESS_field <- function(x){
  x <- x |> mutate(ADDRESS =
                     str_c(
                       str_replace_na(x$organisation_name,
                                      replacement = ""),
                       " ",
                       str_replace_na(x$department_name,
                                      replacement = ""),
                       " ",
                       str_replace_na(x$po_box_number,
                                      replacement = ""),
                       " ",

                       str_replace_na(x$sub_building_name,
                                      replacement = ""),
                       " ",
                       str_replace_na(x$building_name,
                                      replacement = ""),
                       " ",
                       str_replace_na(x$building_number,
                                      replacement = ""),
                       " ",
                       str_replace_na(x$dependent_thoroughfare,
                                      replacement = ""),
                       " ",
                       str_replace_na(x$thoroughfare,
                                      replacement = ""),
                       " ",
                       str_replace_na(x$double_dependent_locality,
                                      replacement = ""),
                       " ",
                       str_replace_na(x$dependent_locality,
                                      replacement = ""),
                       " ",
                       str_replace_na(x$post_town,
                                      replacement = ""),
                       " ",
                       str_replace_na(x$postcode,
                                      replacement = "")
                     ) |> str_squish()
                   )
}

add_pupils_ADDRESS_field <- function(x){
  x <- x |> mutate(FIND =
                     str_c(
                       str_replace_na(x$addressline1,
                                      replacement = ""),
                       " ",
                       str_replace_na(x$addressline2,
                                      replacement = ""),
                       " ",
                       str_replace_na(x$addressline3,
                                      replacement = ""),
                       " ",
                       str_replace_na(x$addressline4,
                                      replacement = ""),
                       " ",
                       str_replace_na(x$addressline5,
                                      replacement = "")
                   ) |> str_squish(),
                   FOUND = NA,
                   UPRN = NA,
                   geometry = NA
  )
}

read_wsx_boundary <- function(shape = paste0("D:/data_store/Boundaries/county/",
                                             "county.shp"),
                              buffer = 10000){
  wsx <- read_sf(shape)
  wsx <- wsx |> st_buffer(dist = buffer) |> select(geometry)
}

create_dictionary <- function(x, var){
  # create dictionary
  # used by create_dictionaries function
  dic <- x |>
    select({{ var }}, postcode) |>
    drop_na({{ var }}) |>
    unique() |>
    arrange({{ var }}) |>
    rename(term = {{ var }})
}

collapse_dictionaries <- function(x){
  # "dic_sub_building_name",
  # "dic_building_name",
  ## "dic_building_number",
  # "dic_thoroughfare",
  # "dic_dependent_thoroughfare",
  # "dic_double_dependent_locality",
  # "dic_post_town",
  # "dic_county"
  x |> list_rbind()
}

create_dictionaries <- function(x){
  # create all dictionaries

  # > names(source_wsx)
  # [1] "organisation_name"         "department_name"           "sub_building_name"         "building_name"
  # [5] "building_number"           "dependent_thoroughfare"    "thoroughfare"              "double_dependent_locality"

  # * building_name
  dic_sub_building_name <- x |> create_dictionary(sub_building_name) |>
    arrange() |> unique()

  # * building_name
  dic_building_name <- x |> create_dictionary(building_name) |>
    arrange() |> unique()

  # * building_number ... really necessary?
  # as.numeric(dictionaries$dic_building_number$term) |> is.na() |> table()
  #  FALSE
  # 297132
  # dic_building_number <- x |> create_dictionary(building_number) |>
  #   arrange() |> unique()

  # * county
  dic_county <- create_dictionaries_counties()

  # * posttown
  dic_post_town <- x |> create_dictionary(post_town) |>
    arrange() |> unique()

  # * locality
  dic_double_dependent_locality <- x |>
    create_dictionary(double_dependent_locality) |>
    arrange() |> unique()

  dic_dependent_locality <- x |>
    create_dictionary(dependent_locality) |>
    arrange() |> unique()

  # combine character vectors and remove subsequent duplicates
  # dic_locality <- c(dic_dependent_locality, dic_double_dependent_locality) |>
  #   sort() |> unique()
  # dic_locality

  # * street
  dic_thoroughfare <- x |>
    create_dictionary(thoroughfare) |>
    arrange() |> unique()

  dic_dependent_thoroughfare <- x |>
    create_dictionary(dependent_thoroughfare) |>
    arrange() |> unique()

  # dic_street <- c(dic_thoroughfare, dic_dependent_thoroughfare) |>
  #   sort() |> unique()


  # return list
  return(list(#dic_street = dic_street,
    dic_sub_building_name = dic_sub_building_name,
    dic_building_name = dic_building_name,
    # dic_building_number = dic_building_number,
    dic_thoroughfare = dic_thoroughfare,
    dic_dependent_thoroughfare = dic_dependent_thoroughfare,
    dic_double_dependent_locality = dic_double_dependent_locality,
    # dic_locality = dic_locality,
    dic_post_town = dic_post_town,
    dic_county = dic_county))
}

create_dictionaries_counties <- function(
    gpkg_path = "D:/data_store/OS Boundary-Line/bdline_gb_2024-03-26.gpkg"){
  # special case for counties
  county <- st_read(gpkg_path,
                    layer = "county") |>
    st_drop_geometry() |>
    select(Name)
  county <- county$Name |> str_replace(" County", "")
  county <- county[!county == "Greater London Authority"]

  # district_borough_unitary
  dbs <- st_read(gpkg_path,
                 layer = "district_borough_unitary") |>
    st_drop_geometry() |>
    select(Name)
  # dbs$Name <- dbs$Name |> str_replace(" District (B)", "")
  dbs$Name <- dbs$Name |> str_replace(" District", "")
  dbs$Name <- dbs$Name |> str_replace(" London Boro", "")
  dbs$Name <- dbs$Name |> str_replace("City of ", "")
  dbs <- dbs$Name |> str_replace(" \\(B\\)", "")
  dbs <- c(dbs, "Bedfordshire", "County of Bedford", "Beds",
           "Berkshire", "County of Berks", "Berks",
           "Buckinghamshire", "County of Buckingham", "Bucks",
           "Cambridgeshire", "County of Cambridge", "Cambs",
           "Cheshire", "County of Chester", "Ches", "Cornwall", "Corn",
           "Cumberland", "Cumb", "Derbyshire", "County of Derby", "Derbys",
           "Derbs", "Devon", "Devonshire", "Dev", "Dorset", "Dorsetshire",
           "Dor", "Durham", "County of Durham", "Co Dur", "Essex",
           "Gloucestershire", "County of Gloucester", "Glos", " Gloucs",
           "Hampshire", "County of Southampton", "Hants",
           "Southamptonshire", "Herefordshire", "County of Hereford", "Here",
           "Heref", "Hertfordshire", "County of Hertford", "Herts",
           "Huntingdonshire", "County of Huntingdon", "Hunts", "Kent",
           "Lancashire", "County of Lancaster", "Lancs",
           "Leicestershire", "County of Leicester", "Leics",
           "Lincolnshire", "County of Lincoln", "Lincs",
           "Middlesex", "Mx", "Middx", "Mddx", "M'sex",
           "Norfolk", "Norf", "Northamptonshire", "County of Northampton",
           "Northants", "Northumberland", "Northumb", "Northd",
           "Nottinghamshire", "County of Nottingham", "Notts",
           "Oxfordshire", "County of Oxford", "Oxon",
           "Rutland", "Rutlandshire", "Rut", "Shropshire", "County of Salop",
           "Shrops", "Salop", "Somerset", "Somersetshire", "Som",
           "Staffordshire", "County of Stafford", "Staffs", " Staf",
           "Suffolk", "Suff", "Surrey", "Sy", "Sussex", "Sx", " Ssx",
           "Warwickshire", "County of Warwick", "Warks", " War", "Warw",
           "Westmorland", "Westm", "Wiltshire", "County of Wilts", "Wilts",
           "Worcestershire", "County of Worcester", "Worcs", "Worsts",
           "Yorkshire", "County of York", "Yorks"
  )
  dbs <- dbs |> unique() |> str_to_upper() |> trimws() |> sort()
  tibble(term = dbs, postcode = "")
}

create_postcodes <- function(x){
  # get a list of all relevant postcodes
  # to filter addresses on and confirm matches
  x$postcode |> unique() |> sort()
}

rename_pupil_fields <- function(x){
  # Correct field names for pupils
  # from summer2024 and powerBI dashboard:
  x <- x |> rename(
    "NativeId" = "DfE(7)",
    # "pupilonrolltableid" = "pupilonrolltableid",
    "pupilonrolltableid" = "PupilID",
    # "schoolname" = "School/Academy",
    "schoolname" = "School Name",
    "enrolstatus" = "Enrolment Status",
    # "postcode" = "postcode",
    "postcode" = "Postcode",
    "saon" = "saon",
    "paon" = "paon",
    "street" = "street",
    "locality" = "locality",
    "town" = "town",
    "administrativearea" = "administrativearea",
    "posttown" = "posttown",
    "addressline1" = "addressline1",
    "addressline2" = "addressline2",
    "addressline3" = "addressline3",
    "addressline4" = "addressline4",
    "addressline5" = "addressline5")
  # make NativeId a character vector
  x$NativeId <- as.character(x$NativeId)
  return(x)
}

tidy_fields <- function(x){
  # run on both pupil address fields
  #  and on OS fields for consistent
  #  approach
  # see also original `address_hyphen_to_na`
  x |>
    str_to_upper() |>
    str_replace_all("-- SELECT --", "")|>
    str_replace_all("- ", "") |>
    str_replace_all(regex( "[^[:alnum:][:blank:][-]]"), "") |>
    str_squish()
}

tidy_fields_na <- function(x){
  # any fields with just an empty string become NA
  # compare x to empty string, NA if TRUE
  na_if(x, "")
}

tidy_all_fields <- function(x, start, finish){
  # mapping function for all fields
  x <- x |> mutate(across({{ start }}:{{ finish }}, tidy_fields))
  x <- x |> mutate(across({{ start }}:{{ finish }}, tidy_fields_na))
}

# NOTE: maybe redundant due to change of direction... delete if so:
add_fields_pupils <- function(x, y = os_wsx_addresses){
  # [1] "organisation_name"         "department_name"           "sub_building_name"         "building_name"
  # [5] "building_number"           "dependent_thoroughfare"    "thoroughfare"              "double_dependent_locality"
  # [9] "dependent_locality"        "post_town"                 "postcode"                  "po_box_number"
  tmp <- names(y)
  tmp <- tmp[!tmp == "postcode"]
  tmp <- c(tmp, "easting", "northing")
  x <- x |> mutate(!!!setNames(rep(NA, length(tmp)), tmp))
  return(x)
}

move_addresses_to_addressline_fields <- function(x){
  # take entire dataframe and subset either side
  # simply copy values from saon:town to addressline1:5
  # NOTE: ignores administrativearea and posttown

  # move saon where not NA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  truthy_pupils <- !is.na(x$saon) & is.na(x$addressline1)
  x[truthy_pupils, ]$addressline1 <- x[truthy_pupils, ]$saon
  x[truthy_pupils, ]$saon <- NA

  # move paon where not NA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ## to al1 if NA
  truthy_pupils <- !is.na(x$paon) & is.na(x$addressline1)
  x[truthy_pupils, ]$addressline1 <- x[truthy_pupils, ]$paon
  x[truthy_pupils, ]$paon <- NA
  ## to al2 if al1 is full
  truthy_pupils <- !is.na(x$paon) & !is.na(x$addressline1) & is.na(x$addressline2)
  x[truthy_pupils, ]$addressline2 <- x[truthy_pupils, ]$paon
  x[truthy_pupils, ]$paon <- NA

  # move street where not NA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ## to al1 if NA
  truthy_pupils <- !is.na(x$street) & is.na(x$addressline1)
  x[truthy_pupils, ]$addressline1 <- x[truthy_pupils, ]$street
  x[truthy_pupils, ]$street <- NA
  ## to al2 if al1 is full
  truthy_pupils <- !is.na(x$street) & !is.na(x$addressline1) & is.na(x$addressline2)
  x[truthy_pupils, ]$addressline2 <- x[truthy_pupils, ]$street
  x[truthy_pupils, ]$street <- NA
  ## to al3 if al1 and al2 full
  truthy_pupils <- !is.na(x$street) & !is.na(x$addressline1) & !is.na(x$addressline2) & is.na(x$addressline3)
  x[truthy_pupils, ]$addressline3 <- x[truthy_pupils, ]$street
  x[truthy_pupils, ]$street <- NA

  # move locality where not NA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ## to al1 if NA
  truthy_pupils <- !is.na(x$locality) & is.na(x$addressline1)
  x[truthy_pupils, ]$addressline1 <- x[truthy_pupils, ]$locality
  x[truthy_pupils, ]$locality <- NA
  ## to al2 if al1 is full
  truthy_pupils <- !is.na(x$locality) & !is.na(x$addressline1) & is.na(x$addressline2)
  x[truthy_pupils, ]$addressline2 <- x[truthy_pupils, ]$locality
  x[truthy_pupils, ]$locality <- NA
  ## to al3 if al1 and al2 full
  truthy_pupils <- !is.na(x$locality) & !is.na(x$addressline1) & !is.na(x$addressline2) & is.na(x$addressline3)
  x[truthy_pupils, ]$addressline3 <- x[truthy_pupils, ]$locality
  x[truthy_pupils, ]$locality <- NA
  ## to al4 if al1 and al2 and al3 full
  truthy_pupils <- !is.na(x$locality) & !is.na(x$addressline1) & !is.na(x$addressline2) & !is.na(x$addressline3) & is.na(x$addressline4)
  x[truthy_pupils, ]$addressline4 <- x[truthy_pupils, ]$locality
  x[truthy_pupils, ]$locality <- NA

  # move town where not NA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ## to al1 if NA
  truthy_pupils <- !is.na(x$town) & is.na(x$addressline1)
  x[truthy_pupils, ]$addressline1 <- x[truthy_pupils, ]$town
  x[truthy_pupils, ]$town <- NA
  ## to al2 if al1 is full
  truthy_pupils <- !is.na(x$town) & !is.na(x$addressline1) & is.na(x$addressline2)
  x[truthy_pupils, ]$addressline2 <- x[truthy_pupils, ]$town
  x[truthy_pupils, ]$town <- NA
  ## to al3 if al1 and al2 full
  truthy_pupils <- !is.na(x$town) & !is.na(x$addressline1) & !is.na(x$addressline2) & is.na(x$addressline3)
  x[truthy_pupils, ]$addressline3 <- x[truthy_pupils, ]$town
  x[truthy_pupils, ]$town <- NA
  ## to al4 if al1 and al2 and al3 full
  truthy_pupils <- !is.na(x$town) & !is.na(x$addressline1) & !is.na(x$addressline2) & !is.na(x$addressline3) & is.na(x$addressline4)
  x[truthy_pupils, ]$addressline4 <- x[truthy_pupils, ]$town
  x[truthy_pupils, ]$town <- NA
  ## to al5 if al1 and al2 and al3 and al4 full
  truthy_pupils <- !is.na(x$town) & !is.na(x$addressline1) & !is.na(x$addressline2) & !is.na(x$addressline3) & !is.na(x$addressline4) & is.na(x$addressline5)
  x[truthy_pupils, ]$addressline5 <- x[truthy_pupils, ]$town
  x[truthy_pupils, ]$town <- NA

  truthy_pupils <- !is.na(x$saon) | !is.na(x$paon) |
    !is.na(x$street) | !is.na(x$locality) |
    !is.na(x$town)
  print(paste0("Observation where saon, paon, street, locality, ",
               "town have not been moved: ",
               x[truthy_pupils, ] |> nrow()
               )
        )
  x$administrativearea <- NA
  x$posttown <- NA
  return(x)
}

drop_columns_where_all_NA <- function(x){
  # drop columns where all is NA
  x <- x |> select_if(~ !all(is.na(.)))
}

drop_rows_where_not_938 <- function(x){
  # drop rows where NativeId not 938****[[]]
  x <- x |> filter(str_detect(NativeId, "938"))
}
	`%nin%` <- Negate(`%in%`)

	read_and_merge_osaddressbase <- function(folder_date, # = "2024-08"
	filename, # = "6581692"
	stub = "D:/data_store/OS.ADDRESSBASE/",
	ext = ".gpkg"){
	# merge two layers we need so we can intersect by wsx boundary
	# where to find the geopackage and its filename
	gpkg <- paste0(stub, folder_date, "/", filename, ext)
	# add advisory
	print(paste("Reading and merging ", gpkg))
	# read two layers we need to merge to make sf
	# uprn, geom
	os_blpu <- st_read(gpkg,
	layer = "blpu")
	# uprn, etc...
	os_delivery_point_address <- st_read(gpkg,
	layer = "delivery_point_address")
	os_all_fields <- merge(os_blpu,
	os_delivery_point_address,
	by.x = "uprn",
	by.y = "uprn")
	# usrn, etc..., geom
	os_street <- st_read(gpkg,
	layer = "street") \|>
	select(usrn, geom)
	# usrn, street_description
	os_street_descriptor <- st_read(gpkg,
	layer = "street_descriptor") \|>
	select(usrn, street_description)
	os_streets <- merge(os_street,
	os_street_descriptor,
	by.x = "usrn",
	by.y = "usrn")
	return(list(os_all_fields, os_streets))
	}

	remove_os_fields <- function(x){
	# remove fields not required for making dictionaries
	# but retain geometry for copying to pupils
	x <- x \|> select(-c("change_type.x", "logical_status", "blpu_state",
	"blpu_state_date", "rpc", "local_custodian_code",
	"country", "start_date.x", "end_date.x",
	"last_update_date.x", "entry_date.x",
	"addressbase_postal", "postcode_locator",
	"multi_occ_count", "change_type.y",
	"postcode_type", "delivery_point_suffix",
	"welsh_dependent_thoroughfare","welsh_thoroughfare",
	"welsh_double_dependent_locality",
	"welsh_dependent_locality",
	"welsh_post_town",
	"process_date", "start_date.y", "end_date.y",
	"last_update_date.y", "entry_date.y","parent_uprn",
	"x_coordinate", "y_coordinate", "latitude", "longitude",
	"udprn"))
	x$uprn <- as.character(x$uprn)
	return(x)
	}

	add_os_ADDRESS_field <- function(x){
	x <- x \|> mutate(ADDRESS =
	str_c(
	str_replace_na(x$organisation_name,
	replacement = ""),
	" ",
	str_replace_na(x$department_name,
	replacement = ""),
	" ",
	str_replace_na(x$po_box_number,
	replacement = ""),
	" ",

	str_replace_na(x$sub_building_name,
	replacement = ""),
	" ",
	str_replace_na(x$building_name,
	replacement = ""),
	" ",
	str_replace_na(x$building_number,
	replacement = ""),
	" ",
	str_replace_na(x$dependent_thoroughfare,
	replacement = ""),
	" ",
	str_replace_na(x$thoroughfare,
	replacement = ""),
	" ",
	str_replace_na(x$double_dependent_locality,
	replacement = ""),
	" ",
	str_replace_na(x$dependent_locality,
	replacement = ""),
	" ",
	str_replace_na(x$post_town,
	replacement = ""),
	" ",
	str_replace_na(x$postcode,
	replacement = "")
	) \|> str_squish()
	)
	}

	add_pupils_ADDRESS_field <- function(x){
	x <- x \|> mutate(FIND =
	str_c(
	str_replace_na(x$addressline1,
	replacement = ""),
	" ",
	str_replace_na(x$addressline2,
	replacement = ""),
	" ",
	str_replace_na(x$addressline3,
	replacement = ""),
	" ",
	str_replace_na(x$addressline4,
	replacement = ""),
	" ",
	str_replace_na(x$addressline5,
	replacement = "")
	) \|> str_squish(),
	FOUND = NA,
	UPRN = NA,
	geometry = NA
	)
	}

	read_wsx_boundary <- function(shape = paste0("D:/data_store/Boundaries/county/",
	"county.shp"),
	buffer = 10000){
	wsx <- read_sf(shape)
	wsx <- wsx \|> st_buffer(dist = buffer) \|> select(geometry)
	}

	create_dictionary <- function(x, var){
	# create dictionary
	# used by create_dictionaries function
	dic <- x \|>
	select({{ var }}, postcode) \|>
	drop_na({{ var }}) \|>
	unique() \|>
	arrange({{ var }}) \|>
	rename(term = {{ var }})
	}

	collapse_dictionaries <- function(x){
	# "dic_sub_building_name",
	# "dic_building_name",
	## "dic_building_number",
	# "dic_thoroughfare",
	# "dic_dependent_thoroughfare",
	# "dic_double_dependent_locality",
	# "dic_post_town",
	# "dic_county"
	x \|> list_rbind()
	}

	create_dictionaries <- function(x){
	# create all dictionaries

	# > names(source_wsx)
	# [1] "organisation_name" "department_name" "sub_building_name" "building_name"
	# [5] "building_number" "dependent_thoroughfare" "thoroughfare" "double_dependent_locality"

	# * building_name
	dic_sub_building_name <- x \|> create_dictionary(sub_building_name) \|>
	arrange() \|> unique()

	# * building_name
	dic_building_name <- x \|> create_dictionary(building_name) \|>
	arrange() \|> unique()

	# * building_number ... really necessary?
	# as.numeric(dictionaries$dic_building_number$term) \|> is.na() \|> table()
	# FALSE
	# 297132
	# dic_building_number <- x \|> create_dictionary(building_number) \|>
	# arrange() \|> unique()

	# * county
	dic_county <- create_dictionaries_counties()

	# * posttown
	dic_post_town <- x \|> create_dictionary(post_town) \|>
	arrange() \|> unique()

	# * locality
	dic_double_dependent_locality <- x \|>
	create_dictionary(double_dependent_locality) \|>
	arrange() \|> unique()

	dic_dependent_locality <- x \|>
	create_dictionary(dependent_locality) \|>
	arrange() \|> unique()

	# combine character vectors and remove subsequent duplicates
	# dic_locality <- c(dic_dependent_locality, dic_double_dependent_locality) \|>
	# sort() \|> unique()
	# dic_locality

	# * street
	dic_thoroughfare <- x \|>
	create_dictionary(thoroughfare) \|>
	arrange() \|> unique()

	dic_dependent_thoroughfare <- x \|>
	create_dictionary(dependent_thoroughfare) \|>
	arrange() \|> unique()

	# dic_street <- c(dic_thoroughfare, dic_dependent_thoroughfare) \|>
	# sort() \|> unique()



	# return list
	return(list(#dic_street = dic_street,
	dic_sub_building_name = dic_sub_building_name,
	dic_building_name = dic_building_name,
	# dic_building_number = dic_building_number,
	dic_thoroughfare = dic_thoroughfare,
	dic_dependent_thoroughfare = dic_dependent_thoroughfare,
	dic_double_dependent_locality = dic_double_dependent_locality,
	# dic_locality = dic_locality,
	dic_post_town = dic_post_town,
	dic_county = dic_county))
	}

	create_dictionaries_counties <- function(
	gpkg_path = "D:/data_store/OS Boundary-Line/bdline_gb_2024-03-26.gpkg"){
	# special case for counties
	county <- st_read(gpkg_path,
	layer = "county") \|>
	st_drop_geometry() \|>
	select(Name)
	county <- county$Name \|> str_replace(" County", "")
	county <- county[!county == "Greater London Authority"]

	# district_borough_unitary
	dbs <- st_read(gpkg_path,
	layer = "district_borough_unitary") \|>
	st_drop_geometry() \|>
	select(Name)
	# dbs$Name <- dbs$Name \|> str_replace(" District (B)", "")
	dbs$Name <- dbs$Name \|> str_replace(" District", "")
	dbs$Name <- dbs$Name \|> str_replace(" London Boro", "")
	dbs$Name <- dbs$Name \|> str_replace("City of ", "")
	dbs <- dbs$Name \|> str_replace(" \\(B\\)", "")
	dbs <- c(dbs, "Bedfordshire", "County of Bedford", "Beds",
	"Berkshire", "County of Berks", "Berks",
	"Buckinghamshire", "County of Buckingham", "Bucks",
	"Cambridgeshire", "County of Cambridge", "Cambs",
	"Cheshire", "County of Chester", "Ches", "Cornwall", "Corn",
	"Cumberland", "Cumb", "Derbyshire", "County of Derby", "Derbys",
	"Derbs", "Devon", "Devonshire", "Dev", "Dorset", "Dorsetshire",
	"Dor", "Durham", "County of Durham", "Co Dur", "Essex",
	"Gloucestershire", "County of Gloucester", "Glos", " Gloucs",
	"Hampshire", "County of Southampton", "Hants",
	"Southamptonshire", "Herefordshire", "County of Hereford", "Here",
	"Heref", "Hertfordshire", "County of Hertford", "Herts",
	"Huntingdonshire", "County of Huntingdon", "Hunts", "Kent",
	"Lancashire", "County of Lancaster", "Lancs",
	"Leicestershire", "County of Leicester", "Leics",
	"Lincolnshire", "County of Lincoln", "Lincs",
	"Middlesex", "Mx", "Middx", "Mddx", "M'sex",
	"Norfolk", "Norf", "Northamptonshire", "County of Northampton",
	"Northants", "Northumberland", "Northumb", "Northd",
	"Nottinghamshire", "County of Nottingham", "Notts",
	"Oxfordshire", "County of Oxford", "Oxon",
	"Rutland", "Rutlandshire", "Rut", "Shropshire", "County of Salop",
	"Shrops", "Salop", "Somerset", "Somersetshire", "Som",
	"Staffordshire", "County of Stafford", "Staffs", " Staf",
	"Suffolk", "Suff", "Surrey", "Sy", "Sussex", "Sx", " Ssx",
	"Warwickshire", "County of Warwick", "Warks", " War", "Warw",
	"Westmorland", "Westm", "Wiltshire", "County of Wilts", "Wilts",
	"Worcestershire", "County of Worcester", "Worcs", "Worsts",
	"Yorkshire", "County of York", "Yorks"
	)
	dbs <- dbs \|> unique() \|> str_to_upper() \|> trimws() \|> sort()
	tibble(term = dbs, postcode = "")
	}

	create_postcodes <- function(x){
	# get a list of all relevant postcodes
	# to filter addresses on and confirm matches
	x$postcode \|> unique() \|> sort()
	}

	rename_pupil_fields <- function(x){
	# Correct field names for pupils
	# from summer2024 and powerBI dashboard:
	x <- x \|> rename(
	"NativeId" = "DfE(7)",
	# "pupilonrolltableid" = "pupilonrolltableid",
	"pupilonrolltableid" = "PupilID",
	# "schoolname" = "School/Academy",
	"schoolname" = "School Name",
	"enrolstatus" = "Enrolment Status",
	# "postcode" = "postcode",
	"postcode" = "Postcode",
	"saon" = "saon",
	"paon" = "paon",
	"street" = "street",
	"locality" = "locality",
	"town" = "town",
	"administrativearea" = "administrativearea",
	"posttown" = "posttown",
	"addressline1" = "addressline1",
	"addressline2" = "addressline2",
	"addressline3" = "addressline3",
	"addressline4" = "addressline4",
	"addressline5" = "addressline5")
	# make NativeId a character vector
	x$NativeId <- as.character(x$NativeId)
	return(x)
	}

	tidy_fields <- function(x){
	# run on both pupil address fields
	# and on OS fields for consistent
	# approach
	# see also original `address_hyphen_to_na`
	x \|>
	str_to_upper() \|>
	str_replace_all("-- SELECT --", "")\|>
	str_replace_all("- ", "") \|>
	str_replace_all(regex( "[^[:alnum:][:blank:][-]]"), "") \|>
	str_squish()
	}

	tidy_fields_na <- function(x){
	# any fields with just an empty string become NA
	# compare x to empty string, NA if TRUE
	na_if(x, "")
	}

	tidy_all_fields <- function(x, start, finish){
	# mapping function for all fields
	x <- x \|> mutate(across({{ start }}:{{ finish }}, tidy_fields))
	x <- x \|> mutate(across({{ start }}:{{ finish }}, tidy_fields_na))
	}

	# NOTE: maybe redundant due to change of direction... delete if so:
	add_fields_pupils <- function(x, y = os_wsx_addresses){
	# [1] "organisation_name" "department_name" "sub_building_name" "building_name"
	# [5] "building_number" "dependent_thoroughfare" "thoroughfare" "double_dependent_locality"
	# [9] "dependent_locality" "post_town" "postcode" "po_box_number"
	tmp <- names(y)
	tmp <- tmp[!tmp == "postcode"]
	tmp <- c(tmp, "easting", "northing")
	x <- x \|> mutate(!!!setNames(rep(NA, length(tmp)), tmp))
	return(x)
	}

	move_addresses_to_addressline_fields <- function(x){
	# take entire dataframe and subset either side
	# simply copy values from saon:town to addressline1:5
	# NOTE: ignores administrativearea and posttown

	# move saon where not NA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	truthy_pupils <- !is.na(x$saon) & is.na(x$addressline1)
	x[truthy_pupils, ]$addressline1 <- x[truthy_pupils, ]$saon
	x[truthy_pupils, ]$saon <- NA

	# move paon where not NA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	## to al1 if NA
	truthy_pupils <- !is.na(x$paon) & is.na(x$addressline1)
	x[truthy_pupils, ]$addressline1 <- x[truthy_pupils, ]$paon
	x[truthy_pupils, ]$paon <- NA
	## to al2 if al1 is full
	truthy_pupils <- !is.na(x$paon) & !is.na(x$addressline1) & is.na(x$addressline2)
	x[truthy_pupils, ]$addressline2 <- x[truthy_pupils, ]$paon
	x[truthy_pupils, ]$paon <- NA

	# move street where not NA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	## to al1 if NA
	truthy_pupils <- !is.na(x$street) & is.na(x$addressline1)
	x[truthy_pupils, ]$addressline1 <- x[truthy_pupils, ]$street
	x[truthy_pupils, ]$street <- NA
	## to al2 if al1 is full
	truthy_pupils <- !is.na(x$street) & !is.na(x$addressline1) & is.na(x$addressline2)
	x[truthy_pupils, ]$addressline2 <- x[truthy_pupils, ]$street
	x[truthy_pupils, ]$street <- NA
	## to al3 if al1 and al2 full
	truthy_pupils <- !is.na(x$street) & !is.na(x$addressline1) & !is.na(x$addressline2) & is.na(x$addressline3)
	x[truthy_pupils, ]$addressline3 <- x[truthy_pupils, ]$street
	x[truthy_pupils, ]$street <- NA

	# move locality where not NA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	## to al1 if NA
	truthy_pupils <- !is.na(x$locality) & is.na(x$addressline1)
	x[truthy_pupils, ]$addressline1 <- x[truthy_pupils, ]$locality
	x[truthy_pupils, ]$locality <- NA
	## to al2 if al1 is full
	truthy_pupils <- !is.na(x$locality) & !is.na(x$addressline1) & is.na(x$addressline2)
	x[truthy_pupils, ]$addressline2 <- x[truthy_pupils, ]$locality
	x[truthy_pupils, ]$locality <- NA
	## to al3 if al1 and al2 full
	truthy_pupils <- !is.na(x$locality) & !is.na(x$addressline1) & !is.na(x$addressline2) & is.na(x$addressline3)
	x[truthy_pupils, ]$addressline3 <- x[truthy_pupils, ]$locality
	x[truthy_pupils, ]$locality <- NA
	## to al4 if al1 and al2 and al3 full
	truthy_pupils <- !is.na(x$locality) & !is.na(x$addressline1) & !is.na(x$addressline2) & !is.na(x$addressline3) & is.na(x$addressline4)
	x[truthy_pupils, ]$addressline4 <- x[truthy_pupils, ]$locality
	x[truthy_pupils, ]$locality <- NA

	# move town where not NA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	## to al1 if NA
	truthy_pupils <- !is.na(x$town) & is.na(x$addressline1)
	x[truthy_pupils, ]$addressline1 <- x[truthy_pupils, ]$town
	x[truthy_pupils, ]$town <- NA
	## to al2 if al1 is full
	truthy_pupils <- !is.na(x$town) & !is.na(x$addressline1) & is.na(x$addressline2)
	x[truthy_pupils, ]$addressline2 <- x[truthy_pupils, ]$town
	x[truthy_pupils, ]$town <- NA
	## to al3 if al1 and al2 full
	truthy_pupils <- !is.na(x$town) & !is.na(x$addressline1) & !is.na(x$addressline2) & is.na(x$addressline3)
	x[truthy_pupils, ]$addressline3 <- x[truthy_pupils, ]$town
	x[truthy_pupils, ]$town <- NA
	## to al4 if al1 and al2 and al3 full
	truthy_pupils <- !is.na(x$town) & !is.na(x$addressline1) & !is.na(x$addressline2) & !is.na(x$addressline3) & is.na(x$addressline4)
	x[truthy_pupils, ]$addressline4 <- x[truthy_pupils, ]$town
	x[truthy_pupils, ]$town <- NA
	## to al5 if al1 and al2 and al3 and al4 full
	truthy_pupils <- !is.na(x$town) & !is.na(x$addressline1) & !is.na(x$addressline2) & !is.na(x$addressline3) & !is.na(x$addressline4) & is.na(x$addressline5)
	x[truthy_pupils, ]$addressline5 <- x[truthy_pupils, ]$town
	x[truthy_pupils, ]$town <- NA

	truthy_pupils <- !is.na(x$saon) \| !is.na(x$paon) \|
	!is.na(x$street) \| !is.na(x$locality) \|
	!is.na(x$town)
	print(paste0("Observation where saon, paon, street, locality, ",
	"town have not been moved: ",
	x[truthy_pupils, ] \|> nrow()
	)
	)
	x$administrativearea <- NA
	x$posttown <- NA
	return(x)
	}

	drop_columns_where_all_NA <- function(x){
	# drop columns where all is NA
	x <- x \|> select_if(~ !all(is.na(.)))
	}

	drop_rows_where_not_938 <- function(x){
	# drop rows where NativeId not 938****[[]]
	x <- x \|> filter(str_detect(NativeId, "938"))
	}
No results found