Created
February 27, 2026 16:09
-
-
Save szechno/6dcb43bf6f45079744bfbb0f39f931aa to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| `%nin%` <- Negate(`%in%`) | |
| read_and_merge_osaddressbase <- function(folder_date, # = "2024-08" | |
| filename, # = "6581692" | |
| stub = "D:/data_store/OS.ADDRESSBASE/", | |
| ext = ".gpkg"){ | |
| # merge two layers we need so we can intersect by wsx boundary | |
| # where to find the geopackage and its filename | |
| gpkg <- paste0(stub, folder_date, "/", filename, ext) | |
| # add advisory | |
| print(paste("Reading and merging ", gpkg)) | |
| # read two layers we need to merge to make sf | |
| # uprn, geom | |
| os_blpu <- st_read(gpkg, | |
| layer = "blpu") | |
| # uprn, etc... | |
| os_delivery_point_address <- st_read(gpkg, | |
| layer = "delivery_point_address") | |
| os_all_fields <- merge(os_blpu, | |
| os_delivery_point_address, | |
| by.x = "uprn", | |
| by.y = "uprn") | |
| # usrn, etc..., geom | |
| os_street <- st_read(gpkg, | |
| layer = "street") |> | |
| select(usrn, geom) | |
| # usrn, street_description | |
| os_street_descriptor <- st_read(gpkg, | |
| layer = "street_descriptor") |> | |
| select(usrn, street_description) | |
| os_streets <- merge(os_street, | |
| os_street_descriptor, | |
| by.x = "usrn", | |
| by.y = "usrn") | |
| return(list(os_all_fields, os_streets)) | |
| } | |
| remove_os_fields <- function(x){ | |
| # remove fields not required for making dictionaries | |
| # but retain geometry for copying to pupils | |
| x <- x |> select(-c("change_type.x", "logical_status", "blpu_state", | |
| "blpu_state_date", "rpc", "local_custodian_code", | |
| "country", "start_date.x", "end_date.x", | |
| "last_update_date.x", "entry_date.x", | |
| "addressbase_postal", "postcode_locator", | |
| "multi_occ_count", "change_type.y", | |
| "postcode_type", "delivery_point_suffix", | |
| "welsh_dependent_thoroughfare","welsh_thoroughfare", | |
| "welsh_double_dependent_locality", | |
| "welsh_dependent_locality", | |
| "welsh_post_town", | |
| "process_date", "start_date.y", "end_date.y", | |
| "last_update_date.y", "entry_date.y","parent_uprn", | |
| "x_coordinate", "y_coordinate", "latitude", "longitude", | |
| "udprn")) | |
| x$uprn <- as.character(x$uprn) | |
| return(x) | |
| } | |
| add_os_ADDRESS_field <- function(x){ | |
| x <- x |> mutate(ADDRESS = | |
| str_c( | |
| str_replace_na(x$organisation_name, | |
| replacement = ""), | |
| " ", | |
| str_replace_na(x$department_name, | |
| replacement = ""), | |
| " ", | |
| str_replace_na(x$po_box_number, | |
| replacement = ""), | |
| " ", | |
| str_replace_na(x$sub_building_name, | |
| replacement = ""), | |
| " ", | |
| str_replace_na(x$building_name, | |
| replacement = ""), | |
| " ", | |
| str_replace_na(x$building_number, | |
| replacement = ""), | |
| " ", | |
| str_replace_na(x$dependent_thoroughfare, | |
| replacement = ""), | |
| " ", | |
| str_replace_na(x$thoroughfare, | |
| replacement = ""), | |
| " ", | |
| str_replace_na(x$double_dependent_locality, | |
| replacement = ""), | |
| " ", | |
| str_replace_na(x$dependent_locality, | |
| replacement = ""), | |
| " ", | |
| str_replace_na(x$post_town, | |
| replacement = ""), | |
| " ", | |
| str_replace_na(x$postcode, | |
| replacement = "") | |
| ) |> str_squish() | |
| ) | |
| } | |
| add_pupils_ADDRESS_field <- function(x){ | |
| x <- x |> mutate(FIND = | |
| str_c( | |
| str_replace_na(x$addressline1, | |
| replacement = ""), | |
| " ", | |
| str_replace_na(x$addressline2, | |
| replacement = ""), | |
| " ", | |
| str_replace_na(x$addressline3, | |
| replacement = ""), | |
| " ", | |
| str_replace_na(x$addressline4, | |
| replacement = ""), | |
| " ", | |
| str_replace_na(x$addressline5, | |
| replacement = "") | |
| ) |> str_squish(), | |
| FOUND = NA, | |
| UPRN = NA, | |
| geometry = NA | |
| ) | |
| } | |
| read_wsx_boundary <- function(shape = paste0("D:/data_store/Boundaries/county/", | |
| "county.shp"), | |
| buffer = 10000){ | |
| wsx <- read_sf(shape) | |
| wsx <- wsx |> st_buffer(dist = buffer) |> select(geometry) | |
| } | |
| create_dictionary <- function(x, var){ | |
| # create dictionary | |
| # used by create_dictionaries function | |
| dic <- x |> | |
| select({{ var }}, postcode) |> | |
| drop_na({{ var }}) |> | |
| unique() |> | |
| arrange({{ var }}) |> | |
| rename(term = {{ var }}) | |
| } | |
| collapse_dictionaries <- function(x){ | |
| # "dic_sub_building_name", | |
| # "dic_building_name", | |
| ## "dic_building_number", | |
| # "dic_thoroughfare", | |
| # "dic_dependent_thoroughfare", | |
| # "dic_double_dependent_locality", | |
| # "dic_post_town", | |
| # "dic_county" | |
| x |> list_rbind() | |
| } | |
| create_dictionaries <- function(x){ | |
| # create all dictionaries | |
| # > names(source_wsx) | |
| # [1] "organisation_name" "department_name" "sub_building_name" "building_name" | |
| # [5] "building_number" "dependent_thoroughfare" "thoroughfare" "double_dependent_locality" | |
| # * building_name | |
| dic_sub_building_name <- x |> create_dictionary(sub_building_name) |> | |
| arrange() |> unique() | |
| # * building_name | |
| dic_building_name <- x |> create_dictionary(building_name) |> | |
| arrange() |> unique() | |
| # * building_number ... really necessary? | |
| # as.numeric(dictionaries$dic_building_number$term) |> is.na() |> table() | |
| # FALSE | |
| # 297132 | |
| # dic_building_number <- x |> create_dictionary(building_number) |> | |
| # arrange() |> unique() | |
| # * county | |
| dic_county <- create_dictionaries_counties() | |
| # * posttown | |
| dic_post_town <- x |> create_dictionary(post_town) |> | |
| arrange() |> unique() | |
| # * locality | |
| dic_double_dependent_locality <- x |> | |
| create_dictionary(double_dependent_locality) |> | |
| arrange() |> unique() | |
| dic_dependent_locality <- x |> | |
| create_dictionary(dependent_locality) |> | |
| arrange() |> unique() | |
| # combine character vectors and remove subsequent duplicates | |
| # dic_locality <- c(dic_dependent_locality, dic_double_dependent_locality) |> | |
| # sort() |> unique() | |
| # dic_locality | |
| # * street | |
| dic_thoroughfare <- x |> | |
| create_dictionary(thoroughfare) |> | |
| arrange() |> unique() | |
| dic_dependent_thoroughfare <- x |> | |
| create_dictionary(dependent_thoroughfare) |> | |
| arrange() |> unique() | |
| # dic_street <- c(dic_thoroughfare, dic_dependent_thoroughfare) |> | |
| # sort() |> unique() | |
| # return list | |
| return(list(#dic_street = dic_street, | |
| dic_sub_building_name = dic_sub_building_name, | |
| dic_building_name = dic_building_name, | |
| # dic_building_number = dic_building_number, | |
| dic_thoroughfare = dic_thoroughfare, | |
| dic_dependent_thoroughfare = dic_dependent_thoroughfare, | |
| dic_double_dependent_locality = dic_double_dependent_locality, | |
| # dic_locality = dic_locality, | |
| dic_post_town = dic_post_town, | |
| dic_county = dic_county)) | |
| } | |
| create_dictionaries_counties <- function( | |
| gpkg_path = "D:/data_store/OS Boundary-Line/bdline_gb_2024-03-26.gpkg"){ | |
| # special case for counties | |
| county <- st_read(gpkg_path, | |
| layer = "county") |> | |
| st_drop_geometry() |> | |
| select(Name) | |
| county <- county$Name |> str_replace(" County", "") | |
| county <- county[!county == "Greater London Authority"] | |
| # district_borough_unitary | |
| dbs <- st_read(gpkg_path, | |
| layer = "district_borough_unitary") |> | |
| st_drop_geometry() |> | |
| select(Name) | |
| # dbs$Name <- dbs$Name |> str_replace(" District (B)", "") | |
| dbs$Name <- dbs$Name |> str_replace(" District", "") | |
| dbs$Name <- dbs$Name |> str_replace(" London Boro", "") | |
| dbs$Name <- dbs$Name |> str_replace("City of ", "") | |
| dbs <- dbs$Name |> str_replace(" \\(B\\)", "") | |
| dbs <- c(dbs, "Bedfordshire", "County of Bedford", "Beds", | |
| "Berkshire", "County of Berks", "Berks", | |
| "Buckinghamshire", "County of Buckingham", "Bucks", | |
| "Cambridgeshire", "County of Cambridge", "Cambs", | |
| "Cheshire", "County of Chester", "Ches", "Cornwall", "Corn", | |
| "Cumberland", "Cumb", "Derbyshire", "County of Derby", "Derbys", | |
| "Derbs", "Devon", "Devonshire", "Dev", "Dorset", "Dorsetshire", | |
| "Dor", "Durham", "County of Durham", "Co Dur", "Essex", | |
| "Gloucestershire", "County of Gloucester", "Glos", " Gloucs", | |
| "Hampshire", "County of Southampton", "Hants", | |
| "Southamptonshire", "Herefordshire", "County of Hereford", "Here", | |
| "Heref", "Hertfordshire", "County of Hertford", "Herts", | |
| "Huntingdonshire", "County of Huntingdon", "Hunts", "Kent", | |
| "Lancashire", "County of Lancaster", "Lancs", | |
| "Leicestershire", "County of Leicester", "Leics", | |
| "Lincolnshire", "County of Lincoln", "Lincs", | |
| "Middlesex", "Mx", "Middx", "Mddx", "M'sex", | |
| "Norfolk", "Norf", "Northamptonshire", "County of Northampton", | |
| "Northants", "Northumberland", "Northumb", "Northd", | |
| "Nottinghamshire", "County of Nottingham", "Notts", | |
| "Oxfordshire", "County of Oxford", "Oxon", | |
| "Rutland", "Rutlandshire", "Rut", "Shropshire", "County of Salop", | |
| "Shrops", "Salop", "Somerset", "Somersetshire", "Som", | |
| "Staffordshire", "County of Stafford", "Staffs", " Staf", | |
| "Suffolk", "Suff", "Surrey", "Sy", "Sussex", "Sx", " Ssx", | |
| "Warwickshire", "County of Warwick", "Warks", " War", "Warw", | |
| "Westmorland", "Westm", "Wiltshire", "County of Wilts", "Wilts", | |
| "Worcestershire", "County of Worcester", "Worcs", "Worsts", | |
| "Yorkshire", "County of York", "Yorks" | |
| ) | |
| dbs <- dbs |> unique() |> str_to_upper() |> trimws() |> sort() | |
| tibble(term = dbs, postcode = "") | |
| } | |
| create_postcodes <- function(x){ | |
| # get a list of all relevant postcodes | |
| # to filter addresses on and confirm matches | |
| x$postcode |> unique() |> sort() | |
| } | |
| rename_pupil_fields <- function(x){ | |
| # Correct field names for pupils | |
| # from summer2024 and powerBI dashboard: | |
| x <- x |> rename( | |
| "NativeId" = "DfE(7)", | |
| # "pupilonrolltableid" = "pupilonrolltableid", | |
| "pupilonrolltableid" = "PupilID", | |
| # "schoolname" = "School/Academy", | |
| "schoolname" = "School Name", | |
| "enrolstatus" = "Enrolment Status", | |
| # "postcode" = "postcode", | |
| "postcode" = "Postcode", | |
| "saon" = "saon", | |
| "paon" = "paon", | |
| "street" = "street", | |
| "locality" = "locality", | |
| "town" = "town", | |
| "administrativearea" = "administrativearea", | |
| "posttown" = "posttown", | |
| "addressline1" = "addressline1", | |
| "addressline2" = "addressline2", | |
| "addressline3" = "addressline3", | |
| "addressline4" = "addressline4", | |
| "addressline5" = "addressline5") | |
| # make NativeId a character vector | |
| x$NativeId <- as.character(x$NativeId) | |
| return(x) | |
| } | |
| tidy_fields <- function(x){ | |
| # run on both pupil address fields | |
| # and on OS fields for consistent | |
| # approach | |
| # see also original `address_hyphen_to_na` | |
| x |> | |
| str_to_upper() |> | |
| str_replace_all("-- SELECT --", "")|> | |
| str_replace_all("- ", "") |> | |
| str_replace_all(regex( "[^[:alnum:][:blank:][-]]"), "") |> | |
| str_squish() | |
| } | |
| tidy_fields_na <- function(x){ | |
| # any fields with just an empty string become NA | |
| # compare x to empty string, NA if TRUE | |
| na_if(x, "") | |
| } | |
| tidy_all_fields <- function(x, start, finish){ | |
| # mapping function for all fields | |
| x <- x |> mutate(across({{ start }}:{{ finish }}, tidy_fields)) | |
| x <- x |> mutate(across({{ start }}:{{ finish }}, tidy_fields_na)) | |
| } | |
| # NOTE: maybe redundant due to change of direction... delete if so: | |
| add_fields_pupils <- function(x, y = os_wsx_addresses){ | |
| # [1] "organisation_name" "department_name" "sub_building_name" "building_name" | |
| # [5] "building_number" "dependent_thoroughfare" "thoroughfare" "double_dependent_locality" | |
| # [9] "dependent_locality" "post_town" "postcode" "po_box_number" | |
| tmp <- names(y) | |
| tmp <- tmp[!tmp == "postcode"] | |
| tmp <- c(tmp, "easting", "northing") | |
| x <- x |> mutate(!!!setNames(rep(NA, length(tmp)), tmp)) | |
| return(x) | |
| } | |
| move_addresses_to_addressline_fields <- function(x){ | |
| # take entire dataframe and subset either side | |
| # simply copy values from saon:town to addressline1:5 | |
| # NOTE: ignores administrativearea and posttown | |
| # move saon where not NA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
| truthy_pupils <- !is.na(x$saon) & is.na(x$addressline1) | |
| x[truthy_pupils, ]$addressline1 <- x[truthy_pupils, ]$saon | |
| x[truthy_pupils, ]$saon <- NA | |
| # move paon where not NA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
| ## to al1 if NA | |
| truthy_pupils <- !is.na(x$paon) & is.na(x$addressline1) | |
| x[truthy_pupils, ]$addressline1 <- x[truthy_pupils, ]$paon | |
| x[truthy_pupils, ]$paon <- NA | |
| ## to al2 if al1 is full | |
| truthy_pupils <- !is.na(x$paon) & !is.na(x$addressline1) & is.na(x$addressline2) | |
| x[truthy_pupils, ]$addressline2 <- x[truthy_pupils, ]$paon | |
| x[truthy_pupils, ]$paon <- NA | |
| # move street where not NA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
| ## to al1 if NA | |
| truthy_pupils <- !is.na(x$street) & is.na(x$addressline1) | |
| x[truthy_pupils, ]$addressline1 <- x[truthy_pupils, ]$street | |
| x[truthy_pupils, ]$street <- NA | |
| ## to al2 if al1 is full | |
| truthy_pupils <- !is.na(x$street) & !is.na(x$addressline1) & is.na(x$addressline2) | |
| x[truthy_pupils, ]$addressline2 <- x[truthy_pupils, ]$street | |
| x[truthy_pupils, ]$street <- NA | |
| ## to al3 if al1 and al2 full | |
| truthy_pupils <- !is.na(x$street) & !is.na(x$addressline1) & !is.na(x$addressline2) & is.na(x$addressline3) | |
| x[truthy_pupils, ]$addressline3 <- x[truthy_pupils, ]$street | |
| x[truthy_pupils, ]$street <- NA | |
| # move locality where not NA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
| ## to al1 if NA | |
| truthy_pupils <- !is.na(x$locality) & is.na(x$addressline1) | |
| x[truthy_pupils, ]$addressline1 <- x[truthy_pupils, ]$locality | |
| x[truthy_pupils, ]$locality <- NA | |
| ## to al2 if al1 is full | |
| truthy_pupils <- !is.na(x$locality) & !is.na(x$addressline1) & is.na(x$addressline2) | |
| x[truthy_pupils, ]$addressline2 <- x[truthy_pupils, ]$locality | |
| x[truthy_pupils, ]$locality <- NA | |
| ## to al3 if al1 and al2 full | |
| truthy_pupils <- !is.na(x$locality) & !is.na(x$addressline1) & !is.na(x$addressline2) & is.na(x$addressline3) | |
| x[truthy_pupils, ]$addressline3 <- x[truthy_pupils, ]$locality | |
| x[truthy_pupils, ]$locality <- NA | |
| ## to al4 if al1 and al2 and al3 full | |
| truthy_pupils <- !is.na(x$locality) & !is.na(x$addressline1) & !is.na(x$addressline2) & !is.na(x$addressline3) & is.na(x$addressline4) | |
| x[truthy_pupils, ]$addressline4 <- x[truthy_pupils, ]$locality | |
| x[truthy_pupils, ]$locality <- NA | |
| # move town where not NA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
| ## to al1 if NA | |
| truthy_pupils <- !is.na(x$town) & is.na(x$addressline1) | |
| x[truthy_pupils, ]$addressline1 <- x[truthy_pupils, ]$town | |
| x[truthy_pupils, ]$town <- NA | |
| ## to al2 if al1 is full | |
| truthy_pupils <- !is.na(x$town) & !is.na(x$addressline1) & is.na(x$addressline2) | |
| x[truthy_pupils, ]$addressline2 <- x[truthy_pupils, ]$town | |
| x[truthy_pupils, ]$town <- NA | |
| ## to al3 if al1 and al2 full | |
| truthy_pupils <- !is.na(x$town) & !is.na(x$addressline1) & !is.na(x$addressline2) & is.na(x$addressline3) | |
| x[truthy_pupils, ]$addressline3 <- x[truthy_pupils, ]$town | |
| x[truthy_pupils, ]$town <- NA | |
| ## to al4 if al1 and al2 and al3 full | |
| truthy_pupils <- !is.na(x$town) & !is.na(x$addressline1) & !is.na(x$addressline2) & !is.na(x$addressline3) & is.na(x$addressline4) | |
| x[truthy_pupils, ]$addressline4 <- x[truthy_pupils, ]$town | |
| x[truthy_pupils, ]$town <- NA | |
| ## to al5 if al1 and al2 and al3 and al4 full | |
| truthy_pupils <- !is.na(x$town) & !is.na(x$addressline1) & !is.na(x$addressline2) & !is.na(x$addressline3) & !is.na(x$addressline4) & is.na(x$addressline5) | |
| x[truthy_pupils, ]$addressline5 <- x[truthy_pupils, ]$town | |
| x[truthy_pupils, ]$town <- NA | |
| truthy_pupils <- !is.na(x$saon) | !is.na(x$paon) | | |
| !is.na(x$street) | !is.na(x$locality) | | |
| !is.na(x$town) | |
| print(paste0("Observation where saon, paon, street, locality, ", | |
| "town have not been moved: ", | |
| x[truthy_pupils, ] |> nrow() | |
| ) | |
| ) | |
| x$administrativearea <- NA | |
| x$posttown <- NA | |
| return(x) | |
| } | |
| drop_columns_where_all_NA <- function(x){ | |
| # drop columns where all is NA | |
| x <- x |> select_if(~ !all(is.na(.))) | |
| } | |
| drop_rows_where_not_938 <- function(x){ | |
| # drop rows where NativeId not 938****[[]] | |
| x <- x |> filter(str_detect(NativeId, "938")) | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment