|
- library(dplyr)
- library(purrr)
- library(tidyr)
- library(stringr)
- library(readr)
- library(readxl)
- library(fs)
- # library(here)
- library(lubridate)
- library(forcats)
-
- DATA_DIR <- "data-raw"
-
- pop <- list()
-
- import_pre1980 <- function(file) {
- # cli::cat_bullet(file)
- year <- str_extract(file, "\\d{4}\\.csv")
- year <- as.integer(sub("\\.csv", "", year))
- skip <- if (year >= 1960) 8 else 9
- x <- read_csv(file, col_names = FALSE, skip = skip,
- col_types = cols_only(
- X1 = col_guess(), X2 = col_guess(),
- X3 = col_guess(), X4 = col_guess()
-
- ))
- colnames(x) <- c("Age", "Total", "Total Male", "Total Female")
- x <- x[1:max(which(x$Age %in% c("75+", "85+"))), ]
- stopifnot(nrow(x) %in% c(76L, 86L))
- x %>%
- mutate(
- Year = year,
- Age = sub("\\+", "", Age),
- Age = as.integer(Age)
- ) %>%
- select(Year, everything())
- }
-
- pop[["1900-1980"]] <- fs::dir_ls(
- here::here(DATA_DIR, "1900-1980")
- ) %>%
- map_dfr(import_pre1980)
-
- import_1980_1990 <- function(file) {
- read_fwf(
- file,
- col_positions = fwf_positions(
- start = c(3, 5, 7, 12, 22, 32),
- end = c(4, 6, 9, 20, 30, 40),
- col_names = c("Month", "Year", "Age", "Total", "Total Male", "Total Female")
- ),
- skip = 0
- ) %>%
- filter(Age < 101, Month == 4, Year == min(Year)) %>%
- mutate(Year = Year + 1900)
- }
-
- pop[["1980-1990"]] <- fs::dir_ls(
- here::here(DATA_DIR, "1980-1990"),
- regexp = "TXT"
- ) %>%
- map_dfr(import_1980_1990)
-
- import_1990_2000 <- function(file) {
- x <- read_csv(file, col_names = FALSE, skip = 3)
- colnames(x) <- c("Month", "Age", "Total", "Total Male", "Total Female")
-
- x %>%
- filter(
- !is.na(Total),
- !str_detect(Age, "All")
- ) %>%
- mutate(Month = lubridate::parse_date_time(Month, "mdy")) %>%
- filter(
- lubridate::month(Month) == 4
- ) %>%
- mutate(
- Year = lubridate::year(Month),
- Age = str_remove(Age, "\\+"),
- Age = as.integer(Age)
- ) %>%
- select(Year, everything(), -Month)
- }
-
- pop[["1990-2000"]] <- fs::dir_ls(
- here::here(DATA_DIR, "1990-2000"),
- regexp = "\\d{4}\\.csv"
- ) %>%
- map_dfr(import_1990_2000)
-
- # pop[["2000-2010"]] <- read_csv(
- # here::here(DATA_DIR, "2000-2010/us-est00int-alldata.csv")
- # ) %>%
- # select(Month = MONTH, Year = YEAR, Age = AGE,
- # Total = TOT_POP, `Total Male` = TOT_MALE,
- # `Total Female` = TOT_FEMALE) %>%
- # filter(Year > 2000, Age <= 125)
-
- # pop[["2000-2010"]] <- readxl::read_xls(
- # here::here(DATA_DIR, "2000-2010/us-est00int-01.xls"),
- # sheet = "Sheet1"
- # ) %>%
- # mutate(age_label = Age, Age = str_extract(Age, "\\d{1,3}"), Age = as.integer(Age) - str_detect(age_label, "Under") * 5) %>%
- # select(-age_label) %>%
- # gather(Year, value, -1:-2) %>%
- # spread(Group, value) %>%
- # mutate(Year = as.integer(Year)) %>%
- # filter(Year > 2000)
-
- pop[["2000-2010"]] <-
- readr::read_csv(
- here::here(DATA_DIR, "2000-2010", "us-est00int-alldata-5yr.csv")
- ) %>%
- select(
- Year = year,
- Age = AGEGRP,
- Total = TOT_POP,
- `Total Male` = TOT_MALE,
- `Total Female` = TOT_FEMALE
- ) %>%
- filter(Age != 0, Year > 2000, Year < 2010) %>%
- mutate(Age = (Age - 1) * 5)
-
- pop[["2010-2017"]] <- readr::read_csv(
- here::here(DATA_DIR, "2010-2017/PEP_2017_PEPSYASEXN_with_ann.csv"),
- skip = 1
- ) %>%
- select(-1:-4) %>%
- gather(key, value, -Sex) %>%
- filter(str_detect(key, "Population Estimate")) %>%
- mutate(key = str_remove(key, "Pop.+\\) - "), key = str_remove(key, " Total - ")) %>%
- filter(!str_detect(key, "Total|Median")) %>%
- separate(key, c("Year", "Age"), sep = ";") %>%
- mutate(
- Year = as.integer(Year),
- Age = sub("+", "", Age, fixed = TRUE),
- Age = as.integer(Age),
- Sex = recode(
- Sex,
- "Both Sexes" = "Total",
- "Female" = "Total Female",
- "Male" = "Total Male"
- )
- ) %>%
- filter(Year < 2016) %>%
- spread(Sex, value)
-
- # pop[["2018"]] <- readr::read_tsv(
- # here::here(DATA_DIR, "national-population-projections_2014-2060.txt")
- # ) %>%
- # select(-1) %>%
- # filter(!is.na(Year)) %>%
- # select(contains("Code"), contains("Pop")) %>%
- # {colnames(.) <- sub(" Code", "", colnames(.)); .} %>%
- # group_by(Year, Age) %>%
- # mutate(Total = sum(`Projected Populations`)) %>%
- # ungroup() %>%
- # spread(Gender, `Projected Populations`) %>%
- # mutate(
- # Age = sub("+", "", Age, fixed = TRUE),
- # Age = as.integer(Age)
- # ) %>%
- # filter(Year > 2017)
-
- pop[["2018"]] <-
- readr::read_csv(
- here::here(DATA_DIR, "np2017_d1.csv")
- ) %>%
- filter(ORIGIN == 0, RACE == 0) %>%
- gather(Age, n, -SEX:-YEAR) %>%
- mutate(SEX = c("Total", "Total Male", "Total Female")[SEX + 1]) %>%
- spread(SEX, n) %>%
- filter(Age != "TOTAL_POP") %>%
- mutate(
- Age = sub("POP_", "", Age, fixed = TRUE),
- Age = as.integer(Age)
- ) %>%
- select(Year = YEAR, Age, starts_with("Total"))
-
- fs::dir_create(here::here("data"))
- saveRDS(pop, here::here("data", "pop_estimates_by_age.rds"))
|