library(dplyr) library(purrr) library(tidyr) library(stringr) library(readr) library(readxl) library(fs) # library(here) library(lubridate) library(forcats) DATA_DIR <- "data-raw" pop <- list() import_pre1980 <- function(file) { # cli::cat_bullet(file) year <- str_extract(file, "\\d{4}\\.csv") year <- as.integer(sub("\\.csv", "", year)) skip <- if (year >= 1960) 8 else 9 x <- read_csv(file, col_names = FALSE, skip = skip, col_types = cols_only( X1 = col_guess(), X2 = col_guess(), X3 = col_guess(), X4 = col_guess() )) colnames(x) <- c("Age", "Total", "Total Male", "Total Female") x <- x[1:max(which(x$Age %in% c("75+", "85+"))), ] stopifnot(nrow(x) %in% c(76L, 86L)) x %>% mutate( Year = year, Age = sub("\\+", "", Age), Age = as.integer(Age) ) %>% select(Year, everything()) } pop[["1900-1980"]] <- fs::dir_ls( here::here(DATA_DIR, "1900-1980") ) %>% map_dfr(import_pre1980) import_1980_1990 <- function(file) { read_fwf( file, col_positions = fwf_positions( start = c(3, 5, 7, 12, 22, 32), end = c(4, 6, 9, 20, 30, 40), col_names = c("Month", "Year", "Age", "Total", "Total Male", "Total Female") ), skip = 0 ) %>% filter(Age < 101, Month == 4, Year == min(Year)) %>% mutate(Year = Year + 1900) } pop[["1980-1990"]] <- fs::dir_ls( here::here(DATA_DIR, "1980-1990"), regexp = "TXT" ) %>% map_dfr(import_1980_1990) import_1990_2000 <- function(file) { x <- read_csv(file, col_names = FALSE, skip = 3) colnames(x) <- c("Month", "Age", "Total", "Total Male", "Total Female") x %>% filter( !is.na(Total), !str_detect(Age, "All") ) %>% mutate(Month = lubridate::parse_date_time(Month, "mdy")) %>% filter( lubridate::month(Month) == 4 ) %>% mutate( Year = lubridate::year(Month), Age = str_remove(Age, "\\+"), Age = as.integer(Age) ) %>% select(Year, everything(), -Month) } pop[["1990-2000"]] <- fs::dir_ls( here::here(DATA_DIR, "1990-2000"), regexp = "\\d{4}\\.csv" ) %>% map_dfr(import_1990_2000) # pop[["2000-2010"]] <- read_csv( # here::here(DATA_DIR, "2000-2010/us-est00int-alldata.csv") # ) %>% # select(Month = MONTH, Year = YEAR, Age = AGE, # Total = TOT_POP, `Total Male` = TOT_MALE, # `Total Female` = TOT_FEMALE) %>% # filter(Year > 2000, Age <= 125) # pop[["2000-2010"]] <- readxl::read_xls( # here::here(DATA_DIR, "2000-2010/us-est00int-01.xls"), # sheet = "Sheet1" # ) %>% # mutate(age_label = Age, Age = str_extract(Age, "\\d{1,3}"), Age = as.integer(Age) - str_detect(age_label, "Under") * 5) %>% # select(-age_label) %>% # gather(Year, value, -1:-2) %>% # spread(Group, value) %>% # mutate(Year = as.integer(Year)) %>% # filter(Year > 2000) pop[["2000-2010"]] <- readr::read_csv( here::here(DATA_DIR, "2000-2010", "us-est00int-alldata-5yr.csv") ) %>% select( Year = year, Age = AGEGRP, Total = TOT_POP, `Total Male` = TOT_MALE, `Total Female` = TOT_FEMALE ) %>% filter(Age != 0, Year > 2000, Year < 2010) %>% mutate(Age = (Age - 1) * 5) pop[["2010-2017"]] <- readr::read_csv( here::here(DATA_DIR, "2010-2017/PEP_2017_PEPSYASEXN_with_ann.csv"), skip = 1 ) %>% select(-1:-4) %>% gather(key, value, -Sex) %>% filter(str_detect(key, "Population Estimate")) %>% mutate(key = str_remove(key, "Pop.+\\) - "), key = str_remove(key, " Total - ")) %>% filter(!str_detect(key, "Total|Median")) %>% separate(key, c("Year", "Age"), sep = ";") %>% mutate( Year = as.integer(Year), Age = sub("+", "", Age, fixed = TRUE), Age = as.integer(Age), Sex = recode( Sex, "Both Sexes" = "Total", "Female" = "Total Female", "Male" = "Total Male" ) ) %>% filter(Year < 2016) %>% spread(Sex, value) # pop[["2018"]] <- readr::read_tsv( # here::here(DATA_DIR, "national-population-projections_2014-2060.txt") # ) %>% # select(-1) %>% # filter(!is.na(Year)) %>% # select(contains("Code"), contains("Pop")) %>% # {colnames(.) <- sub(" Code", "", colnames(.)); .} %>% # group_by(Year, Age) %>% # mutate(Total = sum(`Projected Populations`)) %>% # ungroup() %>% # spread(Gender, `Projected Populations`) %>% # mutate( # Age = sub("+", "", Age, fixed = TRUE), # Age = as.integer(Age) # ) %>% # filter(Year > 2017) pop[["2018"]] <- readr::read_csv( here::here(DATA_DIR, "np2017_d1.csv") ) %>% filter(ORIGIN == 0, RACE == 0) %>% gather(Age, n, -SEX:-YEAR) %>% mutate(SEX = c("Total", "Total Male", "Total Female")[SEX + 1]) %>% spread(SEX, n) %>% filter(Age != "TOTAL_POP") %>% mutate( Age = sub("POP_", "", Age, fixed = TRUE), Age = as.integer(Age) ) %>% select(Year = YEAR, Age, starts_with("Total")) fs::dir_create(here::here("data")) saveRDS(pop, here::here("data", "pop_estimates_by_age.rds"))