Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Regional issues #159

Merged
merged 5 commits into from
Jan 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions R/check_nuts2013.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#' @title Check NUTS region codes that changed with the \code{NUTS2016} definition
#' @description Eurostat mixes \code{NUTS2013} and \code{NUTS2016} geographic
#' label codes in the \code{'geo'} column, which creates time-wise comparativity issues.
#' This function checks if you data is affected by this problem and gives
#' information on what to do.
#' @param dat A Eurostat data frame downloaded with \code{\link{get_eurostat}}
#' @export
#' @author Daniel Antal
#' @return An augmented data frame or a message about potential coding
#' errors. For filtering, it marks \code{'non_EU'} and \coce{'unchanged'}
#' regions. Observations with codes ending on \code{'ZZ'} or \code{'XX'} are
#' removed from the returned data table, because these are non-territorial
#' observations or they are outside of the EU.
#' @importFrom dplyr left_join mutate filter rename mutate_if
#' @examples
#' \dontrun{
#' dat <- eurostat::tgs00026
#' check_nuts2013(dat)
#' }

check_nuts2013 <- function (dat,
return_changed_regions = FALSE ) {

unchanged_regions <- regional_changes_2016 %>%
filter ( change == 'unchanged')

changed_regions <- regional_changes_2016 %>%
filter ( change != 'unchanged')

tmp <- dat %>%
mutate_if ( is.factor, as.character ) %>%
left_join ( regional_changes_2016 %>%
select ( code16, change ) %>%
dplyr::rename ( geo = code16 ),
by = 'geo')

there_are_changes <- FALSE

if ( any (is.na(tmp$change)) ) {
tmp <- dat %>%
mutate_if ( is.factor, as.character ) %>%
left_join ( regional_changes_2016 %>%
select ( code13, change ) %>%
dplyr::rename ( geo = code13 ),
by = 'geo')

there_are_changes <- TRUE
}

eu_country_vector <- eurostat::eu_countries$code
tmp_country_vector <- unique ( substr(tmp$geo, 1, 2) )
not_EU_country_vector <- tmp_country_vector [! tmp_country_vector %in% eu_country_vector]

if ( length(not_EU_country_vector) > 0 ) {
## The correspondence table only covers EU regions.
message ( "Not checking for regional label consistency in non-EU countries\n",
"In this data frame: ", not_EU_country_vector )
}

if ( any( stringr::str_sub(tmp$geo, -2,-1) %in% c('ZZ', 'XX')) ) {

warning ( "Regional codes ending with ZZ or XX are extra-territorial",
"\n to the EU and they are removed from the data frame.")

}

tmp %>%
mutate ( change = ifelse ( geo %in% not_EU_country_vector ,
'not_EU', change )) %>%
filter ( stringr::str_sub(geo, -3,-1) != "ZZZ",
stringr::str_sub(geo, -2,-1) != "ZZ",
stringr::str_sub(geo, -3,-1) != "XXX",
stringr::str_sub(geo, -2,-1) != "XX" ) %>%
mutate_if ( is.factor, as.character )

}
19 changes: 19 additions & 0 deletions R/data_nuts_correspondence.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#' @title Correspondence Table NUTS2013-NUTS2016
#' @description A tidy version of the Eurostat correspondence for
#' NUTS1 and NUTS2 territorial units.
#' @format A data_frame:
#' \describe{
#' \item{code13}{The geographical code of the territory in the NUTS2013 definition}
#' \item{code16}{The geographical code of the territory in the NUTS2016 definition}
#' \item{name}{Name of the territorial unit in the Eurostat database}
#' \item{nuts_level}{Aggregation level, i.e. 0=national, 1,2,3 for smaller regions.}
#' \item{change}{Change with the region, or 'unchanged'}
#' \item{resolution}{How can the comparison made between NUTS2013 and NUTS2016 units made, if possible.}
#' }
#' @source \url{https://ec.europa.eu/eurostat/web/nuts/history},
#' \url{https://ec.europa.eu/eurostat/documents/345175/629341/NUTS2013-NUTS2016.xlsx}
"nuts_correspondence"

#' @rdname nuts_correspondence
"nuts_correspondence"

18 changes: 18 additions & 0 deletions R/data_regional_changes_2016.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#' @title Changes in regional boundaries NUTS2013-NUTS2016
#' @description A comparison of regional boundaries, codes, and explanation for
#' the change in a data frame, based on the Eurostat correspondence table.
#' @format A data_frame:
#' \describe{
#' \item{code13}{The geographical code of the territory in the NUTS2013 definition}
#' \item{code16}{The geographical code of the territory in the NUTS2016 definition}
#' \item{name}{Name of the territorial unit in the Eurostat database}
#' \item{nuts_level}{Aggregation level, i.e. 0=national, 1,2,3 for smaller regions.}
#' \item{change}{Change with the region, or 'unchanged'}
#' }
#' @source \url{https://ec.europa.eu/eurostat/web/nuts/history},
#' \url{https://ec.europa.eu/eurostat/documents/345175/629341/NUTS2013-NUTS2016.xlsx}
"regional_changes_2016"

#' @rdname eu_countries
"regional_changes_2016"

270 changes: 270 additions & 0 deletions R/harmonize_geo_code.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,270 @@
#' @title Recode geo labels from NUTS2013 to NUTS2016
#' @description Eurostat mixes NUTS2013 and NUTS2016 geographic label codes
#' in the \code{'geo'} column, which creates time-wise comparativity issues.
#' This function recodes the observations where only the coding changed, and
#' marks discontinued regions, and other regions which may or may not be
#' somehow compared to current \code{'NUTS2016'} boundaries.
#' @param dat A Eurostat data frame downloaded with \code{\link{get_eurostat}}.
#' @export
#' @author Daniel Antal
#' @return An augmented and potentially relabelled data frame which
#' contains all formerly \code{'NUTS2013'} definition geo labels in the
#' \code{'NUTS2016'} vocabulary when only the code changed, but the
#' boundary did not. It also contains some information on other geo labels
#' that cannot be brought to the current \code{'NUTS2016'} definition.
#' If not called before, the function will use the helper function
#' \code{\link{check_nuts2013}}
#' @importFrom dplyr left_join mutate filter rename
#' @importFrom stringr str_sub
#' @examples
#' \dontrun{
#' dat <- eurostat::tgs00026 %>%
#' check_nuts2013() %>%
#' harmonize_geo_code()
#'
#' #If check_nuts2013() is not called, the function will call it.
#' dat <- eurostat::tgs00026
#' harmonize_geo_code(dat)
#' }

harmonize_geo_code <- function ( dat ) {

## Check if geo information is present ------------------------------
if (check_dat_input(dat)) {
stop ("There is no 'geo' column in the inserted data. This is an error.") }


if ( ! "change" %in% names ( dat) ) {
dat <- check_nuts2013(dat)

}
unchanged_regions <- regional_changes_2016 %>%
filter ( change == 'unchanged')

changed_regions <- regional_changes_2016 %>%
filter ( change != 'unchanged')

nuts_2016_codes <- unique (regional_changes_2016$code16)
# for easier debugging, this data will be re-assigned in each major
# step as tmp2, tmp3... Debugging is particulary difficult, because
# not only the program code, but the underlying logic may have faults.

tmp_eu_only <- tmp %>%
filter ( change != "not_eu") # leave out non-EU regions.

#Find those codes that are missing from the correct NUTS2016 codes
missing_2016_codes <- nuts_2016_codes [which (! nuts_2016_codes %in% tmp_eu_only$geo )]
missing_2016_codes <- missing_2016_codes [ which (stringr::str_sub(missing_2016_codes, -3, -1) != "ZZZ")]
missing_2016_codes <- missing_2016_codes [ which (stringr::str_sub(missing_2016_codes, -2, -1) != "ZZ")]

#Sort them out by NUTS1 and NUTS2 levels
missing_nuts1_2016 <- missing_2016_codes [ which (nchar(missing_2016_codes) == 3)]
missing_nuts2_2016 <- missing_2016_codes [ which (nchar(missing_2016_codes) == 4)]

# Separating labels that need to be corrected into tmp3 ----------------

correctly_labelled_unchanged <- tmp %>%
filter ( change == 'unchanged' )

tmp_changed <- tmp %>%
filter ( change != 'unchanged')

correctly_labelled_changed <- tmp_changed %>%
filter ( geo %in% changed_regions$code16 )

message ( "There are ", nrow(correctly_labelled_changed), " regions that were changed",
" in the transition to NUTS2016 and\nthe data frame correctly represents their geo codes.")

## Finding incorrectly labelled NUTS1 geo labels --------------------
incorrectlly_labelled_nuts1 <- tmp_changed %>%
filter ( geo %in% changed_regions$code13 ) %>%
filter ( nchar (as.character(geo)) == 3)

n_obsolete_nuts1 <- nrow ( incorrectlly_labelled_nuts1 )

incorrectly_labelled_nuts1_2013 <- incorrectlly_labelled_nuts1 %>%
left_join ( nuts_correspondence %>%
filter ( nuts_level == 1 ) %>%
dplyr::rename ( geo = code13 ) %>%
filter ( !is.na(geo)) %>%
select ( geo, code16, change, resolution ),
by = c('geo', 'change'))

discontinued_nuts1_regions <- incorrectly_labelled_nuts1_2013 %>%
filter ( change == "discontinued")

if ( n_obsolete_nuts1 > 0 ) {

message ( "There are ", nrow ( n_obsolete_nuts1 ),
" observations that have an obsolete NUTS1 geo code." )

message ( "Out of these ", nrow(discontinued_nuts1_regions),
" observations are in discontinued NUTS1 regions which cannot be ",
"\n attributed to the current NUTS2016 boundaries.")
}

incorrectly_labelled_nuts1_2013 <- incorrectly_labelled_nuts1_2013 %>%
filter ( change != "discontinued") %>%
mutate ( problem_code = geo ) %>%
mutate ( geo = code16 )

recoded_nuts1_2013 <- incorrectly_labelled_nuts1_2013 %>%
filter ( change == "recoded")

not_recoded_nuts1_2013 <- incorrectly_labelled_nuts1_2013 %>%
filter ( change != "recoded")


## NUTS1 labels that are missing and which are found ------------------
nuts1_missings <- missing_nuts1_2016 [ which ( missing_nuts1_2016 %in% incorrectly_labelled_nuts1_2013$geo)]

found_nuts1 <- incorrectly_labelled_nuts1_2013 %>%
filter ( geo %in% missing_nuts1_2016 )

if ( n_obsolete_nuts1 > 0 ) {
message ( length(unique(found_nuts1$geo)),
" incorrectly labelled NUTS1 regions could be re-labelled")
}
## Finding incorrectly labelled NUTS2 geo labels --------------------

incorrectlly_labelled_nuts2 <- tmp_changed %>%
filter ( geo %in% changed_regions$code13 ) %>%
filter ( nchar (as.character(geo)) == 4)

n_obsolete_nuts2 <- nrow ( incorrectlly_labelled_nuts2 )

incorrectly_labelled_nuts2_2013 <- incorrectlly_labelled_nuts2 %>%
left_join ( nuts_correspondence %>%
filter ( nuts_level == 2 ) %>%
rename ( geo = code13 ) %>%
filter ( !is.na(geo)) %>%
select ( geo, code16, change, resolution ),
by = c('geo', 'change'))

discontinued_nuts2_regions <- incorrectly_labelled_nuts2_2013 %>%
filter ( change == "discontinued")

if ( n_obsolete_nuts2 > 0 ) {

message ( "There are ", n_obsolete_nuts2,
" observations that have an obsolete NUTS2 geo code." )

message ( "Out of these ", nrow(discontinued_nuts2_regions),
" observations are in discontinued NUTS2 regions which cannot be ",
"\n attributed to the current NUTS2016 boundaries.")
}

incorrectly_labelled_nuts2_2013 <- incorrectly_labelled_nuts2_2013 %>%
filter ( change != "discontinued") %>%
mutate ( problem_code = geo ) %>%
mutate ( geo = code16)

recoded_nuts2_2013 <- incorrectly_labelled_nuts2_2013 %>%
filter ( change == "recoded")

not_recoded_nuts2_2013 <- incorrectly_labelled_nuts2_2013 %>%
filter ( change != "recoded")

if ( n_obsolete_nuts2 > 0 ) {
message ( "There are ", nrow(recoded_nuts2_2013),
" observations in NUTS2 regions that have new geo labels,\n",
"but their boundary did not change. These observations are",
" relabelled to the\nNUTS2016 definition.")
}

found_nuts2 <- recoded_nuts2_2013 %>%
filter ( geo %in% missing_nuts2_2016 )

## If there are no corrections to made at all, return the original data frame ------------
if ( length(unique(found_nuts2$geo)) + length(unique(found_nuts1$geo)) == 0) {
message ( "There is no data found that can be further arranged.\nThe data is returned in its original format.")
return (dat)
}

## If there are changes to be made, make them from here ------------------
join_by <- names ( correctly_labelled_unchanged )
join_by <- join_by [which ( join_by %in% names(correctly_labelled_changed) )]

join_by2 <- names ( correctly_labelled_unchanged )
join_by2 <- join_by2 [which ( join_by2 %in% names(found_nuts1))]

## Add unchanged regions and changed, but correctly labelled ones
so_far_joined <- full_join ( correctly_labelled_unchanged,
correctly_labelled_changed,
by = join_by ) %>%
full_join ( found_nuts1, by = join_by2 )

## Add NUTS1 regions that were recoded, if there are any
if ( nrow(found_nuts1)>0 ) {
join_by3 <- names ( so_far_joined )
join_by3 <- join_by3 [which ( join_by3 %in% names(found_nuts1))]

so_far_joined <- so_far_joined %>%
full_join ( found_nuts2, by = join_by3 )
}


## Add NUTS2 regions that were recoded, if there are any
if ( nrow(found_nuts2)>0 ) {
join_by4 <- names ( so_far_joined )
join_by4 <- join_by3 [which ( join_by4 %in% names(found_nuts2))]

so_far_joined <- so_far_joined %>%
full_join ( found_nuts2, by = join_by4 )
}

not_recoded <- rbind ( not_recoded_nuts2_2013, not_recoded_nuts2_2013 )



## The following geo codes will be changed using rules -------------
additive_rules <- c("FR24",
"FR26","FR43",
"FR23","FR25",
"FR22","FR30",
"FR21","FR41","FR42",
"FR51",
"FR52",
"FR53", "FR61", "FR63",
"FR62", "FR81",
"FR7",
"FR82",
"FR83",
"FRA",
"PL11","PL33",
"PL3",
"PL12",
"IE023", "IE024", "IE025",
"LT00", "LT00A",
"UKM2",
"UKM31", "UKM34", "UKM35", "UKM36",
"UKM24", "UKM32", "UKM33", "UKM37", "UKM38",
"HU102", "HU101"
)

if ( nrow( not_recoded ) > 0 ) {
message ( "There are ", nrow (remaining_eu_data),
" that could not be resolved with relabelling.")
so_far_joined <- rbind ( so_far_joined, not_recoded )

if ( sum ( additive_rules %in% not_recoded$problem_code ) > 0 ) {
message ( "Out of these ", sum ( additive_rules %in% not_recoded$problem_code ),
" boundary changes that can be resolved by additive\ncorrespondence rules.")
}
}

discontinued_nuts_regions <- rbind ( discontinued_nuts1_regions,
discontinued_nuts2_regions)

if ( nrow (discontinued_nuts_regions) > 0 ) {
so_far_joined <- so_far_joined %>%
rbind ( discontinued_nuts_regions %>%
mutate ( problem_code = NA_character_ )
)
}

so_far_joined

}


Loading