Overview

This vignette outlines the data linkage process used to link the various packaged — and potentially user-supplied — datasets. Further details are provided in the linked function documentation.

Linkage

The datasets outlined in the data sources vignette are linked by country code (ISO3) where available and by country name otherwise (using an automated approach with manual checks). This results in a single dataset which includes data on zoonotic TB and various correlated variables. Data completeness is low: some datasets span multiple decades (from the 1950s), while others represent only a single year of data. The default datasets can be replaced by passing datasets in the required format to the arguments of link_data (see ?link_data for details).

Code

EstZoonoticTB::link_data
#> function (z_tb_humans = NULL, tb_humans = NULL, z_tb_animals = NULL, 
#>     demo = NULL, animal_demo = NULL, verbose = TRUE) 
#> {
#>     cases <- NULL
#>     cattle <- NULL
#>     country <- NULL
#>     country.x <- NULL
#>     country.y <- NULL
#>     country_code <- NULL
#>     dirty_id <- NULL
#>     geo_coverage <- NULL
#>     id <- NULL
#>     iso3 <- NULL
#>     multi_year_study <- NULL
#>     population <- NULL
#>     sample_size <- NULL
#>     sampling_strat <- NULL
#>     study_end <- NULL
#>     study_id <- NULL
#>     study_period <- NULL
#>     study_pop <- NULL
#>     dom <- NULL
#>     wild <- NULL
#>     g_whoregion <- NULL
#>     if (is.null(z_tb_humans)) {
#>         z_tb_humans <- EstZoonoticTB::zoonotic_tb_humans %>% 
#>             dplyr::mutate(country = as.character(country))
#>     }
#>     if (is.null(tb_humans)) {
#>         tb_humans <- EstZoonoticTB::tb_data(verbose = FALSE) %>% 
#>             dplyr::mutate(country = as.character(country))
#>     }
#>     if (is.null(z_tb_animals)) {
#>         z_tb_animals <- EstZoonoticTB::zoonotic_tb_animals %>% 
#>             dplyr::mutate(country = as.character(country))
#>     }
#>     if (is.null(demo)) {
#>         demo <- EstZoonoticTB::demographics %>% dplyr::mutate(country = as.character(country))
#>     }
#>     if (is.null(animal_demo)) {
#>         animal_demo <- EstZoonoticTB::animal_demographics %>% 
#>             dplyr::mutate(country = as.character(country))
#>     }
#>     if (verbose) {
#>         message("Joining human and animal demographic data.")
#>         message("Countries with data present for demographics and not animal demographics:")
#>         print(setdiff(demo$country, animal_demo$country))
#>         message("Countries with data present for animal demographics and not demographics:")
#>         print(setdiff(animal_demo$country, demo$country))
#>     }
#>     demo <- suppressWarnings(demo %>% dplyr::full_join(animal_demo, 
#>         by = c("country", "year", "country_code")))
#>     demo <- demo %>% dplyr::mutate(cattle_per_head = cattle/population)
#>     if (verbose) {
#>         message("Joining TB incidence in humans data and zTB presence in animals data using country codes.")
#>         message("Countries with data present for TB incidence and not zTB in animals:")
#>         tb_not_animal_tb <- tb_humans %>% dplyr::filter(iso3 %in% 
#>             setdiff(tb_humans$iso3, z_tb_animals$country_code)) %>% 
#>             dplyr::pull(country) %>% as.character %>% unique
#>         print(tb_not_animal_tb)
#>         message("Countries with data present for zTB in animals and not TB incidence:")
#>         animal_tb_not_tb <- z_tb_animals %>% dplyr::filter(country_code %in% 
#>             setdiff(z_tb_animals$country_code, tb_humans$iso3)) %>% 
#>             dplyr::pull(country) %>% as.character %>% unique
#>         print(animal_tb_not_tb)
#>     }
#>     joined_tb <- suppressWarnings(tb_humans %>% dplyr::rename(country_code = iso3) %>% 
#>         dplyr::mutate(country_code = as.character(country_code)) %>% 
#>         dplyr::full_join(z_tb_animals %>% dplyr::mutate(country_code = as.character(country_code)), 
#>             by = c("country_code", "year"))) %>% dplyr::mutate(country = ifelse(!is.na(country.x), 
#>         as.character(country.x), as.character(country.y))) %>% 
#>         dplyr::select(-country.x, -country.y) %>% dplyr::select(country, 
#>         tidyselect::everything()) %>% dplyr::rename(z_tb_dom_animal = dom, 
#>         z_tb_wild_animal = wild)
#>     if (verbose) {
#>         message("Joining zTB incidence in humans data and all other TB data using country names")
#>     }
#>     z_tb_not_other_tb <- z_tb_humans %>% dplyr::filter(country %in% 
#>         setdiff(z_tb_humans$country, joined_tb$country)) %>% 
#>         dplyr::pull(country) %>% as.character %>% unique
#>     if (length(z_tb_not_other_tb) > 0) {
#>         message("The following countries in the zTB dataset do not match with countries in other TB datasets: ", 
#>             paste(z_tb_not_other_tb, collapse = ", "), "\n Adjust data inputs and rerun this function.")
#>     }
#>     z_tb_humans <- z_tb_humans %>% dplyr::rename(year = study_end) %>% 
#>         dplyr::rename_at(.vars = dplyr::vars(id, geo_coverage, 
#>             study_pop, multi_year_study), ~paste0("z_tb_", .)) %>% 
#>         dplyr::select(-study_id, -dirty_id, -cases, -sample_size, 
#>             -study_period, -sampling_strat) %>% dplyr::mutate(country = as.character(country))
#>     z_tb_humans <- z_tb_humans %>% dplyr::full_join(joined_tb %>% 
#>         dplyr::select(country, country_code, g_whoregion) %>% 
#>         unique, by = c("country"))
#>     joined_tb <- suppressWarnings(joined_tb %>% dplyr::full_join(z_tb_humans, 
#>         by = c("country", "country_code", "year", "g_whoregion")))
#>     if (verbose) {
#>         message("Joining TB data and demographic data using country names.")
#>         message("Countries with data present for TB not for demographics:")
#>         print(setdiff(unique(joined_tb$country), unique(demo$country)))
#>         message("Countries with data present demographics and not TB:")
#>         message("(Some mismatches are to be expected here due to historic country names)")
#>         print(setdiff(unique(demo$country), unique(joined_tb$country)))
#>     }
#>     demo <- demo %>% dplyr::select(-country_code) %>% dplyr::full_join(joined_tb %>% 
#>         dplyr::select(country, country_code, g_whoregion) %>% 
#>         unique, by = c("country"))
#>     out <- joined_tb %>% dplyr::full_join(demo, by = c("country", 
#>         "country_code", "year", "g_whoregion")) %>% dplyr::mutate_if(is.character, 
#>         factor)
#>     return(out)
#> }
#> <bytecode: 0x51e8c28>
#> <environment: namespace:EstZoonoticTB>

Data

all_data <- EstZoonoticTB::link_data(verbose = TRUE)
#> Joining human and animal demographic data.
#> Countries with data present for demographics and not animal demographics:
#>  [1] "Andorra"                  "Anguilla"                
#>  [3] "Aruba"                    "Channel Islands"         
#>  [5] "China, Macao SAR"         "Gibraltar"               
#>  [7] "Holy See"                 "Isle of Man"             
#>  [9] "Kiribati"                 "Maldives"                
#> [11] "Marshall Islands"         "Mayotte"                 
#> [13] "Monaco"                   "Nauru"                   
#> [15] "Northern Mariana Islands" "Palau"                   
#> [17] "San Marino"               "Tokelau"                 
#> [19] "Turks and Caicos Islands" "Tuvalu"
#> Countries with data present for animal demographics and not demographics:
#> [1] "Norfolk Island"
#> Joining TB incidence in humans data and zTB presence in animals data using country codes.
#> Countries with data present for TB incidence and not zTB in animals:
#>  [1] "American Samoa"                       
#>  [2] "Anguilla"                             
#>  [3] "Antigua and Barbuda"                  
#>  [4] "Aruba"                                
#>  [5] "Bahamas"                              
#>  [6] "Benin"                                
#>  [7] "Bermuda"                              
#>  [8] "Bonaire, Saint Eustatius and Saba"    
#>  [9] "British Virgin Islands"               
#> [10] "Cameroon"                             
#> [11] "Cayman Islands"                       
#> [12] "China, Macao SAR"                     
#> [13] "Curaçao"                              
#> [14] "Democratic People's Republic of Korea"
#> [15] "Dominica"                             
#> [16] "Equatorial Guinea"                    
#> [17] "Gabon"                                
#> [18] "Grenada"                              
#> [19] "Guam"                                 
#> [20] "Guatemala"                            
#> [21] "Lebanon"                              
#> [22] "Luxembourg"                           
#> [23] "Maldives"                             
#> [24] "Monaco"                               
#> [25] "Montserrat"                           
#> [26] "Nauru"                                
#> [27] "Niue"                                 
#> [28] "Northern Mariana Islands"             
#> [29] "Puerto Rico"                          
#> [30] "Rwanda"                               
#> [31] "Saint Kitts and Nevis"                
#> [32] "Seychelles"                           
#> [33] "Sint Maarten (Dutch part)"            
#> [34] "Solomon Islands"                      
#> [35] "Tajikistan"                           
#> [36] "Togo"                                 
#> [37] "Tokelau"                              
#> [38] "Turks and Caicos Islands"             
#> [39] "Tuvalu"                               
#> [40] "Wallis and Futuna Islands"            
#> [41] "Yemen"                                
#> [42] "Netherlands Antilles"                 
#> [43] "Serbia & Montenegro"
#> Countries with data present for zTB in animals and not TB incidence:
#>  [1] "Ceuta"                       "Chinese Taipei"             
#>  [3] "Falkland Islands (Malvinas)" "Faroe Islands"              
#>  [5] "French Guiana"               "Guadeloupe (France)"        
#>  [7] "Liechtenstein"               "Martinique"                 
#>  [9] "Mayotte (France)"            "Melilla"                    
#> [11] "Reunion"                     "St. Helena"
#> Joining zTB incidence in humans data and all other TB data using country names
#> Joining TB data and demographic data using country names.
#> Countries with data present for TB not for demographics:
#>  [1] "Bonaire, Saint Eustatius and Saba"
#>  [2] "Curaçao"                          
#>  [3] "Sint Maarten (Dutch part)"        
#>  [4] "West Bank and Gaza Strip"         
#>  [5] "Netherlands Antilles"             
#>  [6] "Serbia & Montenegro"              
#>  [7] "Ceuta"                            
#>  [8] "Chinese Taipei"                   
#>  [9] "Guadeloupe (France)"              
#> [10] "Mayotte (France)"                 
#> [11] "Melilla"                          
#> [12] "St. Helena"
#> Countries with data present demographics and not TB:
#> (Some mismatches are to be expected here due to historic country names)
#>  [1] "Belgium-Luxembourg"              "Channel Islands"                
#>  [3] "Czechoslovakia"                  "Ethiopia PDR"                   
#>  [5] "Gibraltar"                       "Guadeloupe"                     
#>  [7] "Holy See"                        "Isle of Man"                    
#>  [9] "Mayotte"                         NA                               
#> [11] "Pacific Islands Trust Territory" "Palestine"                      
#> [13] "Saint Pierre and Miquelon"       "Sudan (former)"                 
#> [15] "United States Virgin Islands"    "USSR"                           
#> [17] "Western Sahara"                  "Yugoslav SFR"                   
#> [19] "Norfolk Island"

all_data
#> # A tibble: 15,465 x 26
#>    country country_code g_whoregion  year tb_cases tb_inc tb_inc_lo
#>    <fct>   <fct>        <fct>       <dbl>    <int>  <dbl>     <dbl>
#>  1 Afghan… AFG          Eastern Me…  2000    39000    190       123
#>  2 Afghan… AFG          Eastern Me…  2001    41000    189       123
#>  3 Afghan… AFG          Eastern Me…  2002    43000    189       122
#>  4 Afghan… AFG          Eastern Me…  2003    45000    189       122
#>  5 Afghan… AFG          Eastern Me…  2004    47000    189       122
#>  6 Afghan… AFG          Eastern Me…  2005    48000    189       122
#>  7 Afghan… AFG          Eastern Me…  2006    50000    189       122
#>  8 Afghan… AFG          Eastern Me…  2007    51000    189       122
#>  9 Afghan… AFG          Eastern Me…  2008    52000    189       122
#> 10 Afghan… AFG          Eastern Me…  2009    54000    189       123
#> # … with 15,455 more rows, and 19 more variables: tb_inc_hi <dbl>,
#> #   prop_tb_ep <dbl>, prop_hiv <dbl>, prop_hiv_lo <dbl>,
#> #   prop_hiv_hi <dbl>, z_tb_dom_animal <fct>, z_tb_wild_animal <fct>,
#> #   z_tb_id <int>, z_tb_geo_coverage <fct>, z_tb_study_pop <fct>,
#> #   z_tb_multi_year_study <fct>, tb_z_prop <dbl>, tb_z_prop_lo <dbl>,
#> #   tb_z_prop_hi <dbl>, tb_z_prop_se <dbl>, population <dbl>,
#> #   prop_rural <dbl>, cattle <int>, cattle_per_head <dbl>

Latest data

As many of the packaged datasets have a high proportion of missing data, a more useful dataset for analysis may be one that includes the most recent estimate in each country for each variable. This can be extracted using get_latest_combined_data.

Code

EstZoonoticTB::get_latest_combined_data
#> function (data) 
#> {
#>     cattle_per_head <- NULL
#>     country <- NULL
#>     country_code <- NULL
#>     desc <- NULL
#>     g_whoregion <- NULL
#>     population <- NULL
#>     prop_hiv <- NULL
#>     prop_hiv_hi <- NULL
#>     prop_hiv_lo <- NULL
#>     prop_rural <- NULL
#>     prop_tb_ep <- NULL
#>     tb_cases <- NULL
#>     tb_inc <- NULL
#>     tb_inc_hi <- NULL
#>     tb_inc_lo <- NULL
#>     tb_z_prop <- NULL
#>     tb_z_prop_hi <- NULL
#>     tb_z_prop_lo <- NULL
#>     tb_z_prop_se <- NULL
#>     year <- NULL
#>     z_tb_dom_animal <- NULL
#>     z_tb_geo_coverage <- NULL
#>     z_tb_id <- NULL
#>     z_tb_multi_year_study <- NULL
#>     z_tb_study_pop <- NULL
#>     z_tb_wild_animal <- NULL
#>     z_tb <- get_latest_data(data, tb_z_prop) %>% dplyr::select(country, 
#>         country_code, z_tb_year = year, z_tb_id, z_tb_geo_coverage, 
#>         z_tb_study_pop, z_tb_multi_year_study, tb_z_prop, tb_z_prop_lo, 
#>         tb_z_prop_hi, tb_z_prop_se)
#>     z_tb_animals <- get_latest_data(data, z_tb_dom_animal) %>% 
#>         dplyr::select(country, country_code, z_tb_animal_year = year, 
#>             z_tb_dom_animal, z_tb_wild_animal)
#>     tb <- get_latest_data(data, tb_inc) %>% dplyr::select(country, 
#>         country_code, g_whoregion, tb_year = year, tb_cases, 
#>         tb_inc, tb_inc_lo, tb_inc_hi, prop_tb_ep, prop_hiv, prop_hiv_lo, 
#>         prop_hiv_hi)
#>     cattle <- get_latest_data(data, cattle) %>% dplyr::select(country, 
#>         country_code, animal_year = year, cattle, cattle_per_head)
#>     demo <- get_latest_data(data, population) %>% dplyr::select(country, 
#>         country_code, demo_year = year, population, prop_rural)
#>     out <- tb %>% dplyr::full_join(z_tb, by = c("country", "country_code")) %>% 
#>         dplyr::full_join(z_tb_animals, by = c("country", "country_code")) %>% 
#>         dplyr::full_join(demo, by = c("country", "country_code")) %>% 
#>         dplyr::full_join(cattle, by = c("country", "country_code"))
#>     return(out)
#> }
#> <bytecode: 0xcc5e080>
#> <environment: namespace:EstZoonoticTB>


EstZoonoticTB::get_latest_data
#> function (data, variable = NULL) 
#> {
#>     country <- NULL
#>     year <- NULL
#>     desc <- NULL
#>     variable <- rlang::enquo(variable)
#>     data %>% dplyr::mutate(country = country %>% as.character) %>% 
#>         dplyr::filter(!is.na(!!variable)) %>% dplyr::group_by(country) %>% 
#>         dplyr::arrange(desc(year)) %>% dplyr::slice(1) %>% dplyr::ungroup()
#> }
#> <bytecode: 0xccc7338>
#> <environment: namespace:EstZoonoticTB>